We believe it is extremely valuable to share our data analysis with the Data Science community, both to improve education in our field and to exchange feedback and comments.
Exploring the data
In order to understand better the data we are dealing with, we check distribution, box plots, explore correlation and dependence of variables.
Histograms
# 3x2 panel: histogram + boxplot for the first three numeric variables.
par(mfrow = c(3, 2))

# Transportation expense
hist(Absenteeism_withcatnames_wth0$Transportation.expense,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Transportation.expense)),
     col = 'lavender', main = 'Transportation.expense', xlab = 'Transportation.expense')
boxplot(Absenteeism_withcatnames_wth0$Transportation.expense) #585,586 rows (id31)

# Distance from residence to work
hist(Absenteeism_withcatnames_wth0$Distance.from.Residence.to.Work,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Distance.from.Residence.to.Work)),
     col = 'lavender', main = 'Distance.from.Residence.to.Work', xlab = 'Distance.from.Residence.to.Work')
boxplot(Absenteeism_withcatnames_wth0$Distance.from.Residence.to.Work)

# Service time
hist(Absenteeism_withcatnames_wth0$Service.time,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Service.time)),
     col = 'lavender', main = 'Service.time', xlab = 'Service.time')
boxplot(Absenteeism_withcatnames_wth0$Service.time) #587, 588, 598, 590,591 (ID32)

# 3x2 panel: histogram + boxplot for Age, Son and Pet.
par(mfrow = c(3, 2))

# Age
hist(Absenteeism_withcatnames_wth0$Age,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Age)),
     col = 'lavender', main = 'Age', xlab = 'Age')
boxplot(Absenteeism_withcatnames_wth0$Age) #165 /6/7/8/9/70/71/72 (ID9)

# Son (number of children)
hist(Absenteeism_withcatnames_wth0$Son,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Son)),
     col = 'lavender', main = 'Son', xlab = 'Son')
boxplot(Absenteeism_withcatnames_wth0$Son)

# Pet (number of pets)
hist(Absenteeism_withcatnames_wth0$Pet,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Pet)),
     col = 'lavender', main = 'Pet', xlab = 'Pet')
boxplot(Absenteeism_withcatnames_wth0$Pet) #ID 12,2,10,23

# 3x2 panel: histogram + boxplot for the body-measure variables.
par(mfrow = c(3, 2))

# Weight
hist(Absenteeism_withcatnames_wth0$Weight,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Weight)),
     col = 'lavender', main = 'Weight', xlab = 'Weight')
boxplot(Absenteeism_withcatnames_wth0$Weight)

# Height
hist(Absenteeism_withcatnames_wth0$Height,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Height)),
     col = 'lavender', main = 'Height', xlab = 'Height')
boxplot(Absenteeism_withcatnames_wth0$Height) # ID 14, 30, 29, 18, 12, 36, 25, 31

# Body mass index
hist(Absenteeism_withcatnames_wth0$BMI,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$BMI)),
     col = 'lavender', main = 'BMI', xlab = 'BMI')
boxplot(Absenteeism_withcatnames_wth0$BMI)

# 3x2 panel: histogram + boxplot for the frequency/start variables.
par(mfrow = c(3, 2))

# Absence frequency
hist(Absenteeism_withcatnames_wth0$Freq.absence,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Freq.absence)),
     col = 'lavender', main = 'Freq.absence', xlab = 'Freq.absence')
boxplot(Absenteeism_withcatnames_wth0$Freq.absence)

# Disciplinary-failure frequency
hist(Absenteeism_withcatnames_wth0$Freq.failure,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Freq.failure)),
     col = 'lavender', main = 'Freq.failure', xlab = 'Freq.failure')
boxplot(Absenteeism_withcatnames_wth0$Freq.failure) #ID 36

# Age at first start
hist(Absenteeism_withcatnames_wth0$First.start,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$First.start)),
     col = 'lavender', main = 'First.start', xlab = 'First.start')
boxplot(Absenteeism_withcatnames_wth0$First.start) #id 9 , 31

# 3x2 panel: histogram + boxplot for target/workload/absence-hours variables.
par(mfrow = c(3, 2))

# Hit target
hist(Absenteeism_withcatnames_wth0$Hit.target,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Hit.target)),
     col = 'lavender', main = 'Hit.target', xlab = 'Hit target')
boxplot(Absenteeism_withcatnames_wth0$Hit.target) #691,152,452,462,285,293,316,317,41,43,46,102,106,375,213 (it changes so it is not for ID)

# Absenteeism time in hours
hist(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours)),
     col = 'lavender', main = 'Absenteeism.time.in.hours', xlab = 'Absenteeism.time.in.hours')
boxplot(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours) #if >24 outliers
#table(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours)

# Average daily workload (hours)
hist(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day)),
     col = 'lavender', main = 'Hour.Work.load.Average.day', xlab = 'Hour.Work.load.Average.day')
boxplot(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day) #533,589,280,105,56,4,173,477,451,449,466,445,311,683,693,685,574,255,263,262,498,82,61, 78,199,2017,183,669

# 1x2 panel: days-absent distribution and its boxplot.
par(mfrow = c(1, 2))
hist(Absenteeism_withcatnames_wth0$Number.of.days.absent,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Number.of.days.absent)),
     col = 'lavender', main = 'Number.of.days.absent', xlab = 'Number.of.days.absent')
boxplot(Absenteeism_withcatnames_wth0$Number.of.days.absent) #same as absenteeism time in hours

# 4x2 panel of barplots for the categorical variables.
par(mfrow = c(4, 2))

# Treat ID and Bad.habits as categorical before tabulating.
Absenteeism_withcatnames_wth0$ID <- as.factor(Absenteeism_withcatnames_wth0$ID)
Absenteeism_withcatnames_wth0$Bad.habits <- as.factor(Absenteeism_withcatnames_wth0$Bad.habits)
#str(Absenteeism_withcatnames_wth0)

barplot(table(Absenteeism_withcatnames_wth0$Day.of.the.week.nom), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$Month.of.absence.nom), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$Seasons.nom), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$Bad.habits), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$Education), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$ID), col = 'lavender')
barplot(table(Absenteeism_withcatnames_wth0$Reason.for.absence), col = 'lavender')

Q-Q plots
Absent2 <- Absenteeism_withcatnames_wth0

# Normal Q-Q plot (qqnorm + reference qqline) for every numeric variable.
# Same plots as listing each pair explicitly, just driven by a name vector.
for (qq_var in c("Transportation.expense", "Distance.from.Residence.to.Work",
                 "Service.time", "Age", "Hit.target", "Son", "Pet",
                 "Weight", "Height", "BMI", "Absenteeism.time.in.hours",
                 "Freq.absence", "Hour.Work.load.Average.day",
                 "Number.of.days.absent", "Freq.failure", "First.start")) {
  qqnorm(Absent2[[qq_var]])
  qqline(Absent2[[qq_var]])
}
Subset Variable numerical and categorical
Absent2 <- Absenteeism_withcatnames_wth0

# Split the data frame into numeric and categorical column sets.
num_cols <- c("Transportation.expense", "Distance.from.Residence.to.Work",
              "Service.time", "Age", "Hit.target", "Son", "Pet", "Weight",
              "Height", "BMI", "Absenteeism.time.in.hours", "Freq.absence",
              "Hour.Work.load.Average.day", "Number.of.days.absent",
              "Freq.failure", "First.start")
# Numerical
Absent2.num <- Absent2[, num_cols]
# Categorical (everything that is not in num_cols, original order kept)
Absent2.cat <- Absent2[, setdiff(names(Absent2), num_cols)]
Correlations
# Standardise the numeric variables before computing correlations.
Absent2.num.scaled <- scale(Absent2.num, center = TRUE, scale = TRUE)
library(knitr)

# Pearson correlation: blank out weak correlations (|r| < 0.5).
corvarPearson <- round(cor(Absent2.num.scaled), 2)
corvarPearson[abs(corvarPearson) < 0.5] <- NA
#View(corvarPearson)

# Spearman correlation: same |rho| < 0.5 filter.
corvarSpearman <- round(cor(Absent2.num.scaled, method = "spearman"), 2)
corvarSpearman[abs(corvarSpearman) < 0.5] <- NA
#View(corvarSpearman)

library(corrplot)
par(mfrow = c(1, 2))
CorrMatrix <- data.matrix(Absent2.num.scaled)
corrplot(cor(CorrMatrix), diag = FALSE, order = "FPC", tl.pos = "td",
         tl.cex = 0.7, method = "color", type = "upper", number.cex = .6)
corrplot(cor(CorrMatrix, method = "spearman"), diag = FALSE, order = "FPC",
         tl.pos = "td", tl.cex = 0.7, method = "color", type = "upper", number.cex = .6)

- Service and Age are positively correlated (Pearson’s correlation = 0.68 and Spearman = 0.78)
- Age and First.start are positively correlated (Pearson’s correlation = 0.70 and Spearman = 0.57)
- Weight and BMI are positively correlated (Pearson’s correlation = 0.90 and Spearman = 0.88)
- Absenteeism.time.in.hours and Number.of.days.absent are positively correlated (Pearson’s correlation = 0.98 and Spearman = 0.97)
# Scatterplot matrix of the variable pairs flagged as correlated above.
# Fix: library() instead of require() — require() only returns FALSE when the
# package is missing, which would let the next line fail with a confusing error.
library(car)
Absent2.num.scaled.corr <- subset(Absent2.num.scaled, select= c(Service.time, Age, First.start, Weight, BMI, Absenteeism.time.in.hours, Number.of.days.absent))
scatterplotMatrix(Absent2.num.scaled.corr)

Removing possible outliers
Rare values can create bias in further analysis by appearing to be more important than they really are. For this reason, we restricted the outlier analysis to the variables that can change for the same ID — otherwise we would lose a relatively large amount of data. Hit.target, Absenteeism.time.in.hours, Hour.Work.load.Average.day and Number.of.days.absent are the variables analyzed. In particular, Number.of.days.absent is very highly positively correlated with Absenteeism.time.in.hours, so for outlier detection it is enough to consider just one of the two.
The outliers detected by IQR’s method of the three variables Hit.target, Absenteeism.time.in.hours, Hour.Work.load.Average.day were around 11%:
# Boxplots of the three variables screened for outliers.
par(mfrow = c(1, 3))
boxplot(Absenteeism_withcatnames_wth0$Hit.target, main='Hit.target')
boxplot(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours, main='Absenteeism.time.in.hours')
boxplot(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day, main='Hour.Work.load.Average.day')

# Successive filters to gauge how many rows IQR-style cut-offs would drop (~11%).
Absenteeism_withcatnames_wth0Hit <- subset(Absenteeism_withcatnames_wth0, Absenteeism_withcatnames_wth0$Hit.target>85)
Absenteeism_withcatnames_wth0HitHourAbs <- subset(Absenteeism_withcatnames_wth0Hit, Absenteeism_withcatnames_wth0Hit$Absenteeism.time.in.hours<=20 )
Absenteeism_withcatnames_wth0HitHourAbsWorkLoad <- subset(Absenteeism_withcatnames_wth0HitHourAbs, Absenteeism_withcatnames_wth0HitHourAbs$Hour.Work.load.Average.day<6.0)
#dim(Absenteeism_withcatnames_wth0HitHourAbsWorkLoad)
#(696-616)/696

# 3-D view of the three variables together.
Absent2.num.scaled <- as.data.frame(Absent2.num.scaled)
# Fix: library() instead of require() so a missing package errors immediately.
library(scatterplot3d)
scatterplot3d(Absent2.num.scaled$Absenteeism.time.in.hours, Absent2.num.scaled$Hour.Work.load.Average.day, Absent2.num.scaled$Hit.target, xlab='Absenteeism.time.in.hours',ylab='Hour.Work.load.Average.day',zlab='Hit.target', grid=TRUE, box=TRUE)

Since these three variables are not correlated, it is possible to see, for example, that for the same value of Hour.Work.load.Average.day a point can be an outlier for Absenteeism.time.in.hours, for Hit.target, for both, or for neither. The 2-D projections below make this easier to read:
# Scatter plot with marginal histogram (top) and boxplot (right):
# Absenteeism.time.in.hours vs Hour.Work.load.Average.day.
layout(matrix(c(2, 0, 1, 3), nrow = 2, byrow = TRUE),
       widths = c(2, 1), heights = c(1, 2), respect = TRUE)
plot(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours,
     Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day,
     xlab = 'Absenteeism.time.in.hours', ylab = 'Hour.Work.load.Average.day')
hist(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours)),
     col = 'lavender', main = '', xlab = '')
boxplot(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day)

# Hit.target vs Hour.Work.load.Average.day.
layout(matrix(c(2, 0, 1, 3), nrow = 2, byrow = TRUE),
       widths = c(2, 1), heights = c(1, 2), respect = TRUE)
plot(Absenteeism_withcatnames_wth0$Hit.target,
     Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day,
     xlab = 'Hit.target', ylab = 'Hour.Work.load.Average.day')
hist(Absenteeism_withcatnames_wth0$Hit.target,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Hit.target)),
     col = 'lavender', main = '', xlab = '')
boxplot(Absenteeism_withcatnames_wth0$Hour.Work.load.Average.day)

# Absenteeism.time.in.hours vs Hit.target.
layout(matrix(c(2, 0, 1, 3), nrow = 2, byrow = TRUE),
       widths = c(2, 1), heights = c(1, 2), respect = TRUE)
plot(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours,
     Absenteeism_withcatnames_wth0$Hit.target,
     xlab = 'Absenteeism.time.in.hours', ylab = 'Hit.target')
hist(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours,
     probability = TRUE,
     breaks = sqrt(length(Absenteeism_withcatnames_wth0$Absenteeism.time.in.hours)),
     col = 'lavender', main = '', xlab = '')
boxplot(Absenteeism_withcatnames_wth0$Hit.target)

That said, we considered as outliers the observations with Hit.target < 85 or Absenteeism.time.in.hours > 48, corresponding to 4.1% of the data set.
# Drop the rows flagged as outliers (Hit.target <= 85, hours > 48).
Absent_outliers <- Absenteeism_withcatnames_wth0
Absent_outliers <- subset(Absent_outliers, Absent_outliers$Hit.target > 85) #-15 rows
Absent_outliers <- subset(Absent_outliers, Absent_outliers$Absenteeism.time.in.hours <= 48) #-14 rows
(15 + 14) / 696
## [1] 0.04166667
It is not possible to apply the bi-variate (Hit.target and Absenteeism.time.in.hours) plot with the two ellipses, nor the convex hull or the stalactite plot, since those variables are numeric discrete.
# Controlling the removed outliers: keep them in separate data frames for inspection.
Absent_outliersRemoved <- Absenteeism_withcatnames_wth0
Absent_outliersRemovedHit <- subset(Absent_outliersRemoved, Absent_outliersRemoved$Hit.target <= 85)
Absent_outliersRemovedHour <- subset(Absent_outliersRemoved, Absent_outliersRemoved$Absenteeism.time.in.hours > 48)
Pearson’s chi-square test
# Chi-square tests of independence for each pair of categorical variables.
# NOTE(review): several contingency tables here are sparse (many expected
# counts < 5), so the chi-square approximation is rough; p-values are
# indicative only. Tests that return X-squared = NaN are invalid (zero
# expected counts) and support no conclusion.
#str(Absent2.cat)
chisq <- table(Absent2.cat$ID, Absent2.cat$Reason.for.absence.short)
chisq.test(chisq)
##
## Pearson's Chi-squared test
##
## data: chisq
## X-squared = 937.75, df = 288, p-value < 2.2e-16
#ID and Reason for absence are dependent
chisq2 <- table(Absent2.cat$ID, Absent2.cat$Education)
chisq.test(chisq2)
##
## Pearson's Chi-squared test
##
## data: chisq2
## X-squared = 1392, df = 64, p-value < 2.2e-16
#ID and Education are dependent
chisq3 <- table(Absent2.cat$ID, Absent2.cat$Day.of.the.week.nom)
chisq.test(chisq3)
##
## Pearson's Chi-squared test
##
## data: chisq3
## X-squared = 219.87, df = 128, p-value = 8.105e-07
#ID and Day.of.the.week.nom are dependent
chisq4 <- table(Absent2.cat$ID, Absent2.cat$Month.of.absence.nom)
chisq.test(chisq4)
##
## Pearson's Chi-squared test
##
## data: chisq4
## X-squared = NaN, df = 384, p-value = NA
#NOTE(review): X-squared is NaN (zero expected counts) and p-value is NA,
#so this test is invalid - no conclusion about ID and Month.of.absence.nom
#can be drawn from it.
chisq5 <- table(Absent2.cat$ID, Absent2.cat$Seasons.nom)
chisq.test(chisq5)
##
## Pearson's Chi-squared test
##
## data: chisq5
## X-squared = 190.31, df = 96, p-value = 3.433e-08
#ID and Seasons.nom are dependent
chisq6 <- table(Absent2.cat$ID, Absent2.cat$Bad.habits)
chisq.test(chisq6)
##
## Pearson's Chi-squared test
##
## data: chisq6
## X-squared = 2088, df = 96, p-value < 2.2e-16
#ID and Bad.habits are dependent
chisq7 <- table(Absent2.cat$Reason.for.absence.short, Absent2.cat$Education)
chisq.test(chisq7)
##
## Pearson's Chi-squared test
##
## data: chisq7
## X-squared = 89.323, df = 18, p-value = 1.91e-11
#Reason.for.absence.short and Education are dependent
chisq8 <- table(Absent2.cat$Reason.for.absence.short, Absent2.cat$Day.of.the.week.nom)
chisq.test(chisq8)
##
## Pearson's Chi-squared test
##
## data: chisq8
## X-squared = 64.63, df = 36, p-value = 0.002366
#Reason.for.absence.short and Day.of.the.week.nom are dependent
chisq9 <- table(Absent2.cat$Reason.for.absence.short, Absent2.cat$Month.of.absence.nom)
chisq.test(chisq9)
##
## Pearson's Chi-squared test
##
## data: chisq9
## X-squared = NaN, df = 108, p-value = NA
#NOTE(review): invalid test (NaN statistic, NA p-value) - no conclusion about
#Reason.for.absence.short and Month.of.absence.nom can be drawn from it.
chisq10 <- table(Absent2.cat$Reason.for.absence.short, Absent2.cat$Seasons.nom)
chisq.test(chisq10)
##
## Pearson's Chi-squared test
##
## data: chisq10
## X-squared = 171.27, df = 27, p-value < 2.2e-16
#Reason.for.absence.short and Seasons.nom are dependent
chisq11 <- table(Absent2.cat$Reason.for.absence.short, Absent2.cat$Bad.habits)
chisq.test(chisq11)
##
## Pearson's Chi-squared test
##
## data: chisq11
## X-squared = 150.8, df = 27, p-value < 2.2e-16
#Reason.for.absence.short and Bad.habits are dependent
chisq12 <- table(Absent2.cat$Education, Absent2.cat$Day.of.the.week.nom)
chisq.test(chisq12)
##
## Pearson's Chi-squared test
##
## data: chisq12
## X-squared = 6.7051, df = 8, p-value = 0.5688
#Education and Day of the week.nom are independent
chisq13 <- table(Absent2.cat$Education, Absent2.cat$Month.of.absence.nom)
chisq.test(chisq13)
##
## Pearson's Chi-squared test
##
## data: chisq13
## X-squared = NaN, df = 24, p-value = NA
#NOTE(review): invalid test (NaN statistic, NA p-value) - no conclusion about
#Education and Month.of.absence.nom can be drawn from it.
chisq14 <- table(Absent2.cat$Education, Absent2.cat$Seasons.nom)
chisq.test(chisq14)
##
## Pearson's Chi-squared test
##
## data: chisq14
## X-squared = 10.831, df = 6, p-value = 0.09373
#Education and Seasons.nom are independent
chisq15 <- table(Absent2.cat$Education, Absent2.cat$Bad.habits)
chisq.test(chisq15)
##
## Pearson's Chi-squared test
##
## data: chisq15
## X-squared = 331.72, df = 6, p-value < 2.2e-16
#Education and Bad.habits are dependent
chisq16 <- table(Absent2.cat$Day.of.the.week.nom, Absent2.cat$Month.of.absence.nom)
chisq.test(chisq16)
##
## Pearson's Chi-squared test
##
## data: chisq16
## X-squared = NaN, df = 48, p-value = NA
#NOTE(review): invalid test (NaN statistic, NA p-value) - no conclusion about
#Day.of.the.week.nom and Month.of.absence.nom can be drawn from it.
chisq17 <- table(Absent2.cat$Day.of.the.week.nom, Absent2.cat$Seasons.nom)
chisq.test(chisq17)
##
## Pearson's Chi-squared test
##
## data: chisq17
## X-squared = 12.47, df = 12, p-value = 0.4087
#Day.of.the.week.nom and seasons.nom are independent
chisq18 <- table(Absent2.cat$Day.of.the.week.nom, Absent2.cat$Bad.habits)
chisq.test(chisq18)
##
## Pearson's Chi-squared test
##
## data: chisq18
## X-squared = 10.663, df = 12, p-value = 0.558
#Day.of.the.week.nom and Bad.habits are independent
chisq19 <- table(Absent2.cat$Month.of.absence.nom, Absent2.cat$Seasons.nom)
chisq.test(chisq19)
##
## Pearson's Chi-squared test
##
## data: chisq19
## X-squared = NaN, df = 36, p-value = NA
#NOTE(review): invalid test (NaN statistic, NA p-value) - no conclusion about
#Month.of.absence.nom and Seasons.nom can be drawn from it.
chisq20 <- table(Absent2.cat$Month.of.absence.nom, Absent2.cat$Bad.habits)
chisq.test(chisq20)
##
## Pearson's Chi-squared test
##
## data: chisq20
## X-squared = NaN, df = 36, p-value = NA
#NOTE(review): invalid test (NaN statistic, NA p-value) - no conclusion about
#Month.of.absence.nom and Bad.habits can be drawn from it.
chisq21 <- table(Absent2.cat$Seasons.nom, Absent2.cat$Bad.habits)
chisq.test(chisq21)
##
## Pearson's Chi-squared test
##
## data: chisq21
## X-squared = 15.938, df = 9, p-value = 0.06819
#Seasons.nom and Bad.habits are independent.
Summary of the pairwise chi-square results (lower triangle; "Dependent" means the null hypothesis of independence was rejected):

| | ID | Reason.for.absence.short | Education | Day.of.the.week.nom | Month.of.absence.nom | Seasons.nom | Bad.habits |
|---|---|---|---|---|---|---|---|
| ID | | | | | | | |
| Reason.for.absence.short | Dependent | | | | | | |
| Education | Dependent | Dependent | | | | | |
| Day.of.the.week.nom | Dependent | Dependent | Independent | | | | |
| Month.of.absence.nom | Dependent | Dependent | Independent | Independent | | | |
| Seasons.nom | Dependent | Dependent | Independent | Independent | Dependent | | |
| Bad.habits | Dependent | Dependent | Dependent | Independent | Dependent | Independent | |

Note: every entry involving Month.of.absence.nom comes from a chi-square test that returned a NaN statistic (zero expected counts), so those conclusions should be treated with caution.
Multiple Correspondence Analysis
Discretized the continuous variables
Certain basic principles are generally followed for clustering or MCA, regardless of the discretization method used [@Data-Mining-and-Statistics-for-Decision-Making]:
library(car)
# Discretize BMI into three categories with car::Recode.
# NOTE(review): the recode ranges leave gaps (e.g. values in (24.99, 25) or
# (29.99, 30) would become NA) - confirm the observed BMI values are all
# covered by these intervals.
Absent2$Body.mass.cat<-Recode(Absent2$BMI,"19.15:24.99='Normal';25:29.99='High';30:38.01='Obese'")
table (Absent2$Body.mass.cat)
##
## High Normal Obese
## 123 349 224
- Transportation.expense.disc
We discretized the variable so that the following four levels have approximately equal frequency: + Between 118 and 178 reais + Between 179 and 224 reais + Between 225 and 259 reais + Between 260 and 388 reais
# Raw value counts of Transportation.expense before discretization.
table (Absent2$Transportation.expense)
##
## 118 155 157 179 184 189 225 228 233 235 246 248 260 268 279 289 291 300
## 84 29 6 178 7 8 79 8 7 50 29 23 39 2 4 43 37 5
## 330 361 369 378 388
## 14 24 13 5 2
# Fix: library() instead of require() - require() only returns FALSE on a
# missing package, which would let discretize() fail with a confusing error.
library(arules)
# Equal-frequency binning into 4 intervals.
Absent2$Transportation.expense.disc <- discretize (Absent2$Transportation.expense, method = "frequency", breaks=4)
table(Absent2$Transportation.expense.disc)
##
## [118,179) [179,225) [225,260) [260,388]
## 119 193 196 188
We discretized the variable to have 4 levels: + No pet + 1 pet + 2 pets + 4 or more pets (labelled 'More than 4')
# Raw value counts of Pet before recoding.
table (Absent2$Pet)
##
## 0 1 2 4 5 8
## 433 131 92 29 4 7
# Fix: library() instead of require() so a missing package errors immediately.
library(car)
# Group pet counts: none, one, two, and 4-8 (labelled 'More than 4';
# note the label is slightly off - the group includes 4 itself).
Absent2$Pet.disc<-Recode(Absent2$Pet,"0='No pet';1='One pet';2='Two pets';4:8='More than 4'")
table(Absent2$Pet.disc)
##
## More than 4 No pet One pet Two pets
## 40 433 131 92
Freq.failure.disc
- 0: no failure
- 1
- 2-6
# Quick look at the Freq.failure distribution before binning.
hist(Absent2$Freq.failure)

# Equal-frequency binning into 3 intervals: 0, 1, and 2-6 failures.
Absent2$Freq.failure.disc <- discretize(Absent2$Freq.failure, breaks=3, method="frequency")
table(Absent2$Freq.failure.disc)
##
## [0,1) [1,2) [2,6]
## 229 198 269
#Distance.from.Residence.to.Work
# Ensure numeric type, then equal-frequency binning into 4 intervals.
Absent2$Distance.from.Residence.to.Work <- as.numeric(Absent2$Distance.from.Residence.to.Work)
Absent2$Distance.from.Residence.to.Work.disc <- discretize(Absent2$Distance.from.Residence.to.Work, breaks=4, method="frequency")
table(Absent2$Distance.from.Residence.to.Work.disc)
##
## [5,16) [16,26) [26,50) [50,52]
## 155 123 234 184
Service.time.disc
- 1-8 years
- 9-10 years
- 11-13 years
- 14-17 years
- 18-29 years
#Service.time
# Ensure numeric type, then equal-frequency binning into 5 intervals.
Absent2$Service.time <- as.numeric(Absent2$Service.time)
Absent2$Service.time.disc <- discretize(Absent2$Service.time, breaks=5, method="frequency")
table(Absent2$Service.time.disc)
##
## [1,9) [9,11) [11,14) [14,18) [18,29]
## 70 177 163 139 147
#Work.load.Average.day
# Equal-frequency binning of the average daily workload into 5 intervals.
Absent2$Hour.Work.load.Average.day.disc <- discretize(Absent2$Hour.Work.load.Average.day, breaks=5, method="frequency")
table(Absent2$Hour.Work.load.Average.day.disc)
##
## [3.43,3.99) [3.99,4.22) [4.22,4.48) [4.48,5.11) [5.11,6.31]
## 139 125 146 143 143
Hit.target.disc
- 87-91%
- 92-93%
- 94-95%
- 96-97%
- 98-100%
#Hit target
# Equal-frequency binning into 5 intervals.
# NOTE(review): the first computed bin is [81,92), while the text above says
# 87-91% - the bins may have been recomputed after outlier handling; confirm.
Absent2$Hit.target.disc <- discretize(Absent2$Hit.target, breaks=5, method="frequency")
table(Absent2$Hit.target.disc)
##
## [81,92) [92,94) [94,96) [96,98) [98,100]
## 92 171 104 157 172
Freq.absence.disc
- 2-22 absences
- 23-37 absences
- 38-74 absences
- 75-112 absences
#Freq.absence
# Equal-frequency binning of the absence frequency into 4 intervals.
Absent2$Freq.absence.disc <- discretize(Absent2$Freq.absence, breaks=4, method="frequency")
table (Absent2$Freq.absence.disc)
##
## [2,23) [23,38) [38,75) [75,112]
## 164 168 177 187
Son.disc
- 0 child
- 1 child
- 2 children
- 3 or more children (labelled 'More than 3 children')
#Son
# Raw value counts before recoding.
table (Absent2$Son)
##
## 0 1 2 3 4
## 289 209 146 13 39
# Group 3 and 4 children together (13 + 39 = 52 rows). Note the label
# 'More than 3 children' actually covers 3 or more children.
Absent2$Son.disc<-Recode(Absent2$Son,"0='No child';1='One child';2='Two children';3:4='More than 3 children'")
table (Absent2$Son.disc)
##
## More than 3 children No child One child
## 52 289 209
## Two children
## 146
First.start.disc
- 19 years old
- 20-24 years old
- 25-42 years old
#First.start.disc
# Equal-frequency binning of the age at first start into 3 intervals.
Absent2$First.start.disc <- discretize(Absent2$First.start, breaks=3, method="frequency")
table (Absent2$First.start.disc)
##
## [19,20) [20,25) [25,42]
## 88 321 287
#Absent2$Absenteeism.time.in.hours
#Absenteeism.time.in.hours.disc
# Equal-frequency binning of the absence hours into 4 intervals.
Absent2$Absenteeism.time.in.hours.disc <- discretize(Absent2$Absenteeism.time.in.hours, breaks=4, method="frequency")
table (Absent2$Absenteeism.time.in.hours.disc)
##
## [1,2) [2,3) [3,8) [8,120]
## 88 157 180 271
Subset the dataset
# Keep only the categorical/discretized columns for the MCA: drop ID and all
# raw numeric variables (their .disc versions remain).
MCAdata <- subset(
  Absent2,
  select = -c(ID, Transportation.expense, Distance.from.Residence.to.Work,
              Service.time, Age, Hit.target, Son, Pet, Weight, Height,
              Absenteeism.time.in.hours, BMI, Freq.absence, Freq.failure,
              First.start, Hour.Work.load.Average.day, Number.of.days.absent)
)
library(FactoMineR)
#str(MCAdata)
# The Recode outputs are character vectors; MCA needs factors.
MCAdata$Body.mass.cat <- as.factor(MCAdata$Body.mass.cat)
MCAdata$Pet.disc <- as.factor(MCAdata$Pet.disc)
MCAdata$Son.disc <- as.factor(MCAdata$Son.disc)
ALL DATA: Number of dimensions
We decide to discretize all variables and explore this analysis using all variables.
- Number of levels Education:3 Day.of.the.week:5 Month.of.absence:12 Seasons:4 Bad.habits:4 Reason.for.absence:10 Body.mass.cat:3 Transportation.expense:4 Pet.disc:4 Freq.failure:3 Distance:4 Service.time:5 Hour.Work.load.Average.day.disc:5 Hit.target:5 Freq.absence.disc:4 Son.disc:4 First.start.disc:3 Absenteeism.time.in.hours:4
Maximum dimensions: 3+5+12+4+4+10+3+4+4+3+4+5+5+5+4+4+3+4=86 dimensions 86-18=68 dimensions (1/dimensions)*100=1.47 => 25 dimensions
library(FactoMineR)
# First MCA run keeping 68 dimensions (all non-trivial ones).
res.mca <- MCA (MCAdata, ncp=68, graph=TRUE)



library(factoextra)
eig.val <- get_eigenvalue(res.mca)
# Re-run keeping the 25 dimensions above the 1/68 = 1.47% average-eigenvalue threshold.
res.mca <- MCA (MCAdata, ncp=25, graph=TRUE) #18 dimensions



fviz_screeplot(res.mca, addlabels=TRUE)

11 variables: Number of dimensions
# Reduced data set for the 11-variable MCA: additionally drop the month and
# several discretized variables kept in the full run.
MCAdata2 <- subset(Absent2, select=-c(Month.of.absence.nom,Transportation.expense.disc,Distance.from.Residence.to.Work.disc, Service.time.disc, Hour.Work.load.Average.day.disc,Hit.target.disc, First.start.disc,ID,Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target, Son, Pet, Weight, Height, Absenteeism.time.in.hours, BMI, Freq.absence, Freq.failure, First.start, Hour.Work.load.Average.day, Number.of.days.absent))
# Character columns from Recode must be factors for MCA.
MCAdata2$Body.mass.cat <- as.factor(MCAdata2$Body.mass.cat)
MCAdata2$Pet.disc <- as.factor(MCAdata2$Pet.disc)
MCAdata2$Son.disc <- as.factor(MCAdata2$Son.disc)
# NOTE(review): Reason.for.absence.short is still chr below - consider
# converting it to a factor as well before running MCA.
str(MCAdata2)
## 'data.frame': 696 obs. of 11 variables:
## $ Education : Ord.factor w/ 3 levels "High School"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Reason.for.absence.short : chr "Medical consultation" "Accompanying person" "Diseases" "Diseases" ...
## $ Day.of.the.week.nom : Factor w/ 5 levels "Mon","Tue","Wed",..: 2 5 2 5 2 1 2 1 1 1 ...
## $ Seasons.nom : Factor w/ 4 levels "Winter","Summer",..: 1 3 2 3 1 2 1 3 1 4 ...
## $ Bad.habits : Factor w/ 4 levels "Both","Drinker",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Body.mass.cat : Factor w/ 3 levels "High","Normal",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Pet.disc : Factor w/ 4 levels "More than 4",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Freq.failure.disc : Factor w/ 3 levels "[0,1)","[1,2)",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Freq.absence.disc : Factor w/ 4 levels "[2,23)","[23,38)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Son.disc : Factor w/ 4 levels "More than 3 children",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Absenteeism.time.in.hours.disc: Factor w/ 4 levels "[1,2)","[2,3)",..: 1 4 3 4 3 4 3 4 4 3 ...
Number of levels: Education:3 Day.of.week:5 Season:4 Bad.habit:4 Reasons:10 Body.mass:3 Pet:4 Freq.failure:3 Freq.absence:4 Son:4 Absenteeism.time:4
Number of dimensions 3+5+4+4+10+3+4+3+4+4+4=48 48-11=37 Eigenvalue: (1/37)*100=2.7027% => 15 dimensions
# MCA on the 11-variable data set, keeping all 37 non-trivial dimensions.
res.mca <- MCA (MCAdata2, ncp=37, graph=TRUE)



# Inspect the eigenvalues / explained variance per dimension.
res.mca$eig
## eigenvalue percentage of variance
## dim 1 0.280961128 8.3528984
## dim 2 0.249268289 7.4106789
## dim 3 0.216316205 6.4310223
## dim 4 0.184973817 5.4992216
## dim 5 0.170691890 5.0746237
## dim 6 0.145190435 4.3164724
## dim 7 0.137941149 4.1009531
## dim 8 0.130425448 3.8775133
## dim 9 0.123141142 3.6609529
## dim 10 0.112445959 3.3429880
## dim 11 0.106883177 3.1776080
## dim 12 0.100379707 2.9842615
## dim 13 0.093339054 2.7749449
## dim 14 0.092861961 2.7607610
## dim 15 0.090619146 2.6940827
## dim 16 0.089465141 2.6597745
## dim 17 0.087479274 2.6007352
## dim 18 0.085542247 2.5431479
## dim 19 0.080427829 2.3910976
## dim 20 0.079410361 2.3608486
## dim 21 0.077842819 2.3142460
## dim 22 0.071976827 2.1398516
## dim 23 0.070531139 2.0968717
## dim 24 0.068739508 2.0436070
## dim 25 0.065633706 1.9512723
## dim 26 0.059876747 1.7801195
## dim 27 0.052211779 1.5522421
## dim 28 0.043105609 1.2815181
## dim 29 0.038854305 1.1551280
## dim 30 0.034097592 1.0137122
## dim 31 0.028566063 0.8492613
## dim 32 0.025972172 0.7721457
## dim 33 0.023446140 0.6970474
## dim 34 0.018359498 0.5458229
## dim 35 0.011974674 0.3560038
## dim 36 0.008404296 0.2498575
## dim 37 0.006280131 0.1867066
## cumulative percentage of variance
## dim 1 8.352898
## dim 2 15.763577
## dim 3 22.194600
## dim 4 27.693821
## dim 5 32.768445
## dim 6 37.084917
## dim 7 41.185870
## dim 8 45.063384
## dim 9 48.724337
## dim 10 52.067325
## dim 11 55.244932
## dim 12 58.229194
## dim 13 61.004139
## dim 14 63.764900
## dim 15 66.458983
## dim 16 69.118757
## dim 17 71.719492
## dim 18 74.262640
## dim 19 76.653738
## dim 20 79.014586
## dim 21 81.328832
## dim 22 83.468684
## dim 23 85.565556
## dim 24 87.609163
## dim 25 89.560435
## dim 26 91.340554
## dim 27 92.892797
## dim 28 94.174315
## dim 29 95.329443
## dim 30 96.343155
## dim 31 97.192416
## dim 32 97.964562
## dim 33 98.661609
## dim 34 99.207432
## dim 35 99.563436
## dim 36 99.813293
## dim 37 100.000000
library(factoextra)
eig.val <- get_eigenvalue(res.mca)
# Keep the 15 dimensions above the 1/37 = 2.70% average-eigenvalue threshold.
#15 dimensions
res.mca <- MCA (MCAdata2, ncp=15, graph=TRUE)



#Data visualization
library(factoextra)
# Variable categories on the first factorial plane.
fviz_mca_var(res.mca)

# Squared cosines (quality of representation) of the variable categories.
get.cos2 <- get_mca_var(res.mca)$cos2
# Top-20 category contributions to the retained dimensions.
fviz_contrib(res.mca, choice="var", top=20)

fviz_screeplot(res.mca, addlabels=TRUE)

fviz_cos2(res.mca, choice="var", top=20, axes=1:2)

#summary(res.mca)
# Correlation between the variables and the principal dimensions.
fviz_mca_var(res.mca, choice="mca.cor", repel=TRUE, ggtheme=theme_minimal())

#fviz_mca_var(res.mca, repel=TRUE, ggtheme=theme_minimal())
#fviz_mca_var(res.mca)
fviz_mca_var(res.mca, choice="var.cat", col.var="black")

#per observation
ind <- get_mca_ind(res.mca)
#ind
#fviz_mca_ind(res.mca, col.var="cos2", gradient.cols=c("#32CD32", "#FFD700", "#FF0000"), repel=TRUE, ggtheme=theme_minimal())
#cos2
fviz_cos2(res.mca, choice="ind", axes=1:2, top=20)

#contribution
fviz_contrib(res.mca, choice="ind", axes=1:2, top=20)

# Individuals coloured by season, with confidence ellipses.
fviz_mca_ind(res.mca, label="none", habillage = "Seasons.nom", addEllipses = TRUE, ellipse.type="confidence", ggtheme=theme_minimal())

# Confidence ellipses for every categorical variable at once.
fviz_ellipses(res.mca, c("Seasons.nom","Day.of.the.week.nom","Bad.habits", "Education", "Body.mass.cat", "Pet.disc", "Son.disc", "Absenteeism.time.in.hours.disc", "Reason.for.absence.short"), geom="point")

#dimension description
res.desc <- dimdesc (res.mca, axes= c(1,2))
#res.desc[[1]]
#Graphs for the poster:
# Rename columns to display-friendly labels.
# Fix: the label vector must follow the actual column order of MCAdata2 shown
# by str() above (Education, Reason, Day, Seasons, Bad.habits, BMI, Pet,
# Freq.failure, Freq.absence, Son, Absenteeism time). The previous order
# mislabelled columns 2-5 (e.g. Reason.for.absence.short became 'Week day').
colnames(MCAdata2) <- c('Education', 'Reasons', 'Week day', 'Seasons', 'Bad habits', 'BMI', 'Pet', 'Failure frequency', 'Absence frequency', 'Children', 'Absenteeism time')
res.mca <- MCA (MCAdata2, ncp=15, graph=TRUE)



library(ggsci)
#fviz_ellipses(res.mca, axes = c(1, 2), ellipse.type = "confidence" , ggtheme = theme_bw(), c("Seasons.nom","Day.of.the.week.nom","Bad.habits", "Education", "Body.mass.cat", "Pet.disc", "Son.disc", "Absenteeism.time.in.hours", "Reason.for.absence.short"), geom="point")
fviz_ellipses(res.mca,repel=TRUE, axes=c(1,2), ellipse.type = "confidence", c('Education','Week day', 'Seasons', 'Bad habits', 'Reasons', 'BMI', 'Pet', 'Failure frequency', 'Absence frequency', 'Children', 'Absenteeism time'),geom="point")

#fviz_ellipses(res.mca,repel=TRUE, axes=c(1,2),ggtheme=theme_minimal,ellipse.type = "confidence", c('Education','Week day', 'Seasons', 'Bad habits', 'Reasons', 'BMI', 'Pet', 'Failure frequency', 'Absence frequency', 'Children', 'Absenteeism time'),geom="point")
Making the Hierarchical clustering
library("NbClust")
# NOTE(review): fviz_nbclust() and get_dist() come from factoextra (loaded
# earlier); NbClust itself is not called in this chunk.
# Elbow method
fviz_nbclust(res.mca$ind$coord, hcut, method = "wss", diss=get_dist(res.mca$ind$coord, method="spearman")) +
#geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")

# Silhouette method
fviz_nbclust(res.mca$ind$coord, hcut, method = "silhouette", diss=get_dist(res.mca$ind$coord, method="spearman") )+
labs(subtitle = "Silhouette method")
The Elbow and Silhouette methods suggest 4 clusters.
# Hierarchical Clustering on Principal Components with 4 clusters,
# as suggested by the Elbow and Silhouette methods above.
res.hcpc = HCPC(res.mca,nb.clust=4)



# Cluster map on the first factorial plane.
# Fixed: palette was the string "NULL", which is not a valid palette name;
# the NULL object selects the default ggplot2 colors.
fviz_cluster(res.hcpc, repel=TRUE, show.clust.cent = TRUE, palette=NULL, ggtheme=theme_minimal(), geom="point")

Just categoricals: Number of dimensions
# Build the reduced MCA data set from the purely categorical variables.
keep_cats <- c("Education", "Day.of.the.week.nom", "Seasons.nom",
               "Bad.habits", "Reason.for.absence.short", "Body.mass.cat")
MCAdata3 <- Absent2[, keep_cats]
# Body.mass.cat arrives as character; MCA needs a factor.
MCAdata3$Body.mass.cat <- as.factor(MCAdata3$Body.mass.cat)
str(MCAdata3)
## 'data.frame': 696 obs. of 6 variables:
## $ Education : Ord.factor w/ 3 levels "High School"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Day.of.the.week.nom : Factor w/ 5 levels "Mon","Tue","Wed",..: 2 5 2 5 2 1 2 1 1 1 ...
## $ Seasons.nom : Factor w/ 4 levels "Winter","Summer",..: 1 3 2 3 1 2 1 3 1 4 ...
## $ Bad.habits : Factor w/ 4 levels "Both","Drinker",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Reason.for.absence.short: chr "Medical consultation" "Accompanying person" "Diseases" "Diseases" ...
## $ Body.mass.cat : Factor w/ 3 levels "High","Normal",..: 1 1 1 1 1 1 1 1 1 1 ...
Education:3 Day.of.week:5 Season:4 Bad.habit:4 Reasons:10 Body.mass:3
- Number of dimensions 3+5+4+4+10+3=29 29-6=23 Eigenvalue: (1/23)*100=4.348% => 10 dimensions
# Full-dimensional MCA on the 6 categorical variables
# (23 dims = 29 categories - 6 variables).
res.mca <- MCA (MCAdata3, ncp=23, graph=TRUE)



# Eigenvalue / percentage-of-variance table for every dimension.
res.mca$eig
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 0.33537176 8.748828 8.748828
## dim 2 0.28397264 7.407982 16.156810
## dim 3 0.24881990 6.490954 22.647764
## dim 4 0.22164642 5.782081 28.429845
## dim 5 0.21406560 5.584320 34.014165
## dim 6 0.20035265 5.226591 39.240756
## dim 7 0.19748501 5.151783 44.392539
## dim 8 0.18371196 4.792486 49.185025
## dim 9 0.17630737 4.599323 53.784347
## dim 10 0.17165924 4.478067 58.262414
## dim 11 0.16326747 4.259151 62.521566
## dim 12 0.15946743 4.160020 66.681586
## dim 13 0.15313608 3.994854 70.676440
## dim 14 0.15167740 3.956802 74.633241
## dim 15 0.14747064 3.847060 78.480302
## dim 16 0.13834733 3.609061 82.089362
## dim 17 0.13705267 3.575287 85.664649
## dim 18 0.12404581 3.235978 88.900627
## dim 19 0.11746429 3.064286 91.964913
## dim 20 0.09429685 2.459918 94.424831
## dim 21 0.08018139 2.091688 96.516519
## dim 22 0.07271273 1.896854 98.413373
## dim 23 0.06082070 1.586627 100.000000
library(factoextra)
# Eigenvalues table (variance and cumulative variance per dimension).
eig.val <- get_eigenvalue(res.mca)
#10 dimensions retained (eigenvalue cutoff (1/23)*100 = 4.348%)
res.mca <- MCA (MCAdata3, ncp=10, graph=TRUE)



Same result as using the previous 11 variables.
Principal component, Factor and Cluster Analysis
Reasons and goals of the analysis
We use the methods to explore whether previously clusters may exist in the data set. We used in this analysis the original numerical variables we have in our disposal. The choice of considering only numerical ones follows from the fact that we want to perform PCA before, and use its output as input for the clustering methods. It could be possible perform cluster analysis using mixed variables with packages that performs the distance also between categorical variables but since our goal is to use the information of the PCA we didn’t choose this road.
We standardize before because of the different scales of measurements: the goal is to make the variables comparable. We will not consider Work.load.Average.day but Hour.Work.load.Average.day, because the description of it could be simpler.
We will consider the cleaned data set. In fact, the goal of this section is to find clusters which describe the observations regarding the absenteeism hours. For example: Cluster1, 5 hours: tot sons, tot pets, tot failures ecc…
In conclusion, the variables selected are: Freq.failure, Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target, Son, Pet, Weight, Height, BMI, Freq.absence, Hour.Work.load.Average.day, Number.of.days.absent, First.start (15 variables).
# Work on a copy of the outlier-cleaned data set.
Absenteeism_Clustering <- Absent_outliers
#str(Absenteeism_Clustering)

# Drop the identifier and the categorical variables, keeping only the
# numerical ones for PCA / clustering.
AbsenteeismCont <- subset(
  Absenteeism_Clustering,
  select = -c(ID, Month.of.absence.nom, Seasons.nom, Education,
              Bad.habits, Day.of.the.week.nom, Reason.for.absence.short)
)
#str(AbsenteeismCont)

# Standardize (center and scale) so the variables are comparable.
AbsenteeismCont_Norm <- scale(AbsenteeismCont, center = TRUE, scale = TRUE)
Pre-selection
# Pre-selection: drop redundant variables and the target itself
# (Absenteeism.time.in.hours is excluded from the clustering inputs).
AbsenteeismCont_Norm_presel <- as.data.frame(
  subset(AbsenteeismCont_Norm,
         select = -c(Weight, Number.of.days.absent, Age, Absenteeism.time.in.hours))
)
str(AbsenteeismCont_Norm_presel)
## 'data.frame': 667 obs. of 12 variables:
## $ Transportation.expense : num 0.228 0.228 0.228 0.228 0.228 ...
## $ Distance.from.Residence.to.Work: num -1.27 -1.27 -1.27 -1.27 -1.27 ...
## $ Service.time : num 0.338 0.338 0.338 0.338 0.338 ...
## $ Hit.target : num -0.329 -1.307 -0.655 -0.329 -0.655 ...
## $ Son : num 0.00684 0.00684 0.00684 0.00684 0.00684 ...
## $ Pet : num 0.197 0.197 0.197 0.197 0.197 ...
## $ Height : num -0.0151 -0.0151 -0.0151 -0.0151 -0.0151 ...
## $ BMI : num 0.762 0.762 0.762 0.762 0.762 ...
## $ Freq.absence : num -0.742 -0.742 -0.742 -0.742 -0.742 ...
## $ Freq.failure : num -0.228 -0.228 -0.228 -0.228 -0.228 ...
## $ Hour.Work.load.Average.day : num -0.13 -0.621 -0.867 2.715 -0.532 ...
## $ First.start : num -0.134 -0.134 -0.134 -0.134 -0.134 ...
Principal Component Analysis
Now, the PCA using these 12 variables.
To perform PCA we use first the command “princomp” and after, we compare the results of the command “PCA” [FactoMineR package].
# PCA via princomp on the correlation matrix; scores kept for clustering later.
xnorm.pca <- princomp(AbsenteeismCont_Norm_presel,cor=TRUE, scores = TRUE) #scores=TRUE
summary(xnorm.pca) #to compare with PCA command
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.5928365 1.3814672 1.3314686 1.0625976 1.00492425
## Proportion of Variance 0.2114273 0.1590376 0.1477341 0.0940928 0.08415606
## Cumulative Proportion 0.2114273 0.3704650 0.5181990 0.6122918 0.69644790
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.94692479 0.88484616 0.80775888 0.70283072
## Proportion of Variance 0.07472221 0.06524606 0.05437287 0.04116425
## Cumulative Proportion 0.77117012 0.83641618 0.89078905 0.93195330
## Comp.10 Comp.11 Comp.12
## Standard deviation 0.6196938 0.52848517 0.39146317
## Proportion of Variance 0.0320017 0.02327471 0.01277028
## Cumulative Proportion 0.9639550 0.98722972 1.00000000
#considering the % of variance explained, screeplot
#plot(xnorm.pca, type = "l", main="Elbow plot")
It seems there is a Elbow in Component 4. But considering the % of explained variance and the value of the eigenvalues:
library("FactoMineR")
# PCA keeping all 12 components, for comparison against princomp above.
res.pca=PCA(AbsenteeismCont_Norm_presel, ncp = 12, graph = FALSE)
library("factoextra")
# Scree plot with percentage-of-variance labels.
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

And in particular:
# Eigenvalue / cumulative explained-variance table from the PCA.
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.5371282 21.142735 21.14273
## Dim.2 1.9084516 15.903763 37.04650
## Dim.3 1.7728087 14.773406 51.81990
## Dim.4 1.1291136 9.409280 61.22918
## Dim.5 1.0098728 8.415606 69.64479
## Dim.6 0.8966666 7.472221 77.11701
## Dim.7 0.7829527 6.524606 83.64162
## Dim.8 0.6524744 5.437287 89.07890
## Dim.9 0.4939710 4.116425 93.19533
## Dim.10 0.3840204 3.200170 96.39550
## Dim.11 0.2792966 2.327471 98.72297
## Dim.12 0.1532434 1.277028 100.00000
Comparing princomp and PCA, we obtain the same results, as we were expecting.
Considering that an eigenvalue > 1 indicates that PCs account for more variance than accounted by one of the original variables in standardized data. This is commonly used as a cutoff point for which PCs are retained. This holds true only when the data are standardized, as our case.
So, using this eigenvalue cutoff, the number of dimensions should be 5, but the explained variance is then only about 70%. Keeping components up to dimension 9 gives a better explained variance, even though the scree plot shows an elbow at 4, where the explained variance is only around 61%.
We believe that keeping 9 components could be a good idea, but let's examine the contributions to be sure.
#code to save the scores that we will need later in the clustering analysis
mat=xnorm.pca$scores
#dim(mat)
matcomp9=mat[, 1:9]
xnorm.pca$loadings #results princcomp command. NB: the compenents are standardized! we can consider the loading relatively to the component but not absolutely. it is normal that SS loadings is equal to 1
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Transportation.expense 0.378 0.425
## Distance.from.Residence.to.Work -0.191 0.586 -0.102 -0.118
## Service.time -0.425 0.349 -0.193
## Hit.target -0.915 0.320
## Son 0.320 0.151 0.278 -0.464 -0.216 -0.323
## Pet 0.321 0.351 -0.157 0.227 0.448
## Height -0.452 -0.110 -0.199 -0.115
## BMI -0.293 0.129 0.448 0.187 0.352
## Freq.absence -0.522 0.224 -0.168
## Freq.failure 0.118 0.555 -0.206
## Hour.Work.load.Average.day -0.115 -0.627 0.279 0.637
## First.start 0.243 -0.160 0.449 0.403 0.229
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11
## Transportation.expense -0.141 0.190 0.641 -0.153
## Distance.from.Residence.to.Work -0.119 0.160 -0.249 -0.575
## Service.time 0.489 -0.601
## Hit.target 0.167 0.104
## Son 0.145 -0.421 0.314 0.350
## Pet -0.257 -0.392 0.343 -0.376
## Height -0.758 -0.328
## BMI -0.379 0.221 0.291 0.451
## Freq.absence -0.290 -0.182 0.242
## Freq.failure -0.720 -0.145 -0.284
## Hour.Work.load.Average.day 0.287
## First.start 0.237 0.211 -0.297 -0.451 0.145
## Comp.12
## Transportation.expense 0.416
## Distance.from.Residence.to.Work -0.391
## Service.time 0.183
## Hit.target
## Son
## Pet
## Height 0.128
## BMI -0.209
## Freq.absence 0.686
## Freq.failure
## Hour.Work.load.Average.day
## First.start 0.294
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083
## Cumulative Var 0.083 0.167 0.250 0.333 0.417 0.500 0.583 0.667
## Comp.9 Comp.10 Comp.11 Comp.12
## SS loadings 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083
## Cumulative Var 0.750 0.833 0.917 1.000
Considering the loadings and its interpretation, looking to the histograms contribution:
# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 12)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 12)

# Contributions of variables to PC3
# (fixed: top was 13 here, inconsistent with the other calls — only 12 variables exist)
fviz_contrib(res.pca, choice = "var", axes = 3, top = 12)

# Contributions of variables to PC4 through PC12
fviz_contrib(res.pca, choice = "var", axes = 4, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 5, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 6, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 7, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 8, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 9, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 10, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 11, top = 12)

fviz_contrib(res.pca, choice = "var", axes = 12, top = 12)

Component 1: -0.522 Freq.failure, -0.425 Service.time
Component2: 0.586 Distance.from.Residence.to.work, -0.452 Height
Component3: 0.555 Freq.failure, 0.449 First.start, 0.448 BMI
Component4: -0.627 Hour.Work.load.Average.day, -0.464 Son
Component5: -0.915 Hit.target
Component6: 0.637 Hour.Work.load.Average.day
Component7: -0.758 Height
Component8: -0.720 Freq.failure
Component9: -0.641 Transportation.expense
Component10: 0.575 Distance.from.residence.to work, 0.451 First.start
Component11: 0.601 Service time, -0.451 BMI
Component12: 0.686 Freq.absence
The component loadings are difficult to interpret.
Summarizing:
library("corrplot")
# Heat map of variable contributions to every principal component.
corrplot(res.pca$var$contrib, is.corr=FALSE)

Until component 5, as suggested from the value of the eigenvalue, there is a vast explanation. But, also the components from 6 to 9, as the %explained variance suggested, are very important regarding some variables. We expected to lose some information regarding Service.time and Freq.absence since they are contributing in the component 11 and component 12. Since the goal is not only to consider orthogonal variables, but also to reduce (from 12 to 9 is not a big reduction but still) we will consider only the 9 components.
The total contribution of the first 9 principal components is:
fviz_contrib(res.pca, choice = "var", axes = 1:9, top = 12)

Even if some variables are better explained by components we did not retain, overall the total contribution covers all the variables well. The most contributing variable is "Hit.target", followed by "Hour.Work.load.Average.day" and "Freq.failure".
More analysis of PCA results considering cos2:
library("corrplot")
# Quality of representation (cos2) of each variable on every component.
corrplot(res.pca$var$cos2, is.corr=FALSE)

# Total cos2 of variables on Dim.1 through Dim.9
fviz_cos2(res.pca, choice = "var", axes = 1:9)

Note that, a high cos2 indicates a good representation of the variable on the principal component. In this case the variable is positioned close to the circumference of the correlation circle. A low cos2 indicates that the variable is not perfectly represented by the PCs. In this case the variable is close to the center of the circle.
The cos2 values are used to estimate the quality of the representation The closer a variable is to the circle of correlations, the better its representation on the factor map (and the more important it is to interpret these components) Variables that are closed to the center of the plot are less important for the first components. variables with low cos2 values will be colored in “white” variables with mid cos2 values will be colored in “blue” variables with high cos2 values will be colored in red but we will considered two components, so remember to pay attention to not interpret these results are general.
# Color by cos2 values: quality on the factor map
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

Factor Analysis
Since the interpretation of the loadings of the principal components is confused, we decide to perform factor analysis. In fact, at the end of it, it is possible to rotate with the command "varimax" to obtain a clearer interpretation. As we saw in the classes, to extract the factors we will use the principal component factoring method. As in principal component analysis, we will use the continuous and pre-selected variables; like this, we can compare principal components and factors and decide which ones to use in the future.
#str(AbsenteeismCont_Norm_presel)
# Correlation matrix of the 12 pre-selected standardized variables.
R=cor(AbsenteeismCont_Norm_presel)
library(psych)
# Kaiser-Meyer-Olkin measure of sampling adequacy for factor analysis.
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.51
## MSA for each item =
## Transportation.expense Distance.from.Residence.to.Work
## 0.50 0.49
## Service.time Hit.target
## 0.55 0.53
## Son Pet
## 0.57 0.59
## Height BMI
## 0.54 0.47
## Freq.absence Freq.failure
## 0.49 0.51
## Hour.Work.load.Average.day First.start
## 0.47 0.45
Because of the theory, we know KMO 0.9 marvelous KMO 0.8 meritorious KMO 0.7 middling KMO 0.6 mediocre KMO 0.5 miserable KMO < 0.5 unacceptable. The overall MSA is 0.51 and some variables have <0.5. We will try to perform anyway but we know it is not the appropriate way.
As we knew already from PCA:
#eigen(R)
# Compute the spectral decomposition once and reuse it
# (the original called eigen(R) four separate times).
eig_R <- eigen(R)
# Scree plot of the eigenvalues.
plot(eig_R$values, type="b")

# Share of variance explained by each eigenvalue (12 variables total).
perc_explained<-eig_R$values/12
cum_explain<-cumsum(perc_explained)
table<-cbind(eigenvalue=eig_R$values,perc_explained,cum_explain)
table
## eigenvalue perc_explained cum_explain
## [1,] 2.5371282 0.21142735 0.2114273
## [2,] 1.9084516 0.15903763 0.3704650
## [3,] 1.7728087 0.14773406 0.5181990
## [4,] 1.1291136 0.09409280 0.6122918
## [5,] 1.0098728 0.08415606 0.6964479
## [6,] 0.8966666 0.07472221 0.7711701
## [7,] 0.7829527 0.06524606 0.8364162
## [8,] 0.6524744 0.05437287 0.8907890
## [9,] 0.4939710 0.04116425 0.9319533
## [10,] 0.3840204 0.03200170 0.9639550
## [11,] 0.2792966 0.02327471 0.9872297
## [12,] 0.1532434 0.01277028 1.0000000
The number of unique parameters in R is (12x13)/2=78, and considering five factors we reduce this to 5x12+12=72. Considering m=9 factors, similar to PCA, would not make sense, since 9x12+12=120 exceeds 78.
(12*13)/2
## [1] 78
5*12+12
## [1] 72
9*12+12
## [1] 120
# Principal-component factoring: loadings = eigenvectors scaled by the
# square roots of the eigenvalues (L = V %*% D with D = diag(sqrt(lambda))).
# Compute eigen(R) once instead of twice and build the diagonal matrix
# directly with diag(), instead of filling a zero matrix by hand.
eig_decomp <- eigen(R)
loadings <- eig_decomp$vectors %*% diag(sqrt(eig_decomp$values))
rownames(loadings) <- names(AbsenteeismCont_Norm_presel)
# Show the loadings of the first five factors.
loadings[,1:5]
## [,1] [,2] [,3]
## Transportation.expense -0.60234611 -0.58773272 0.04271993
## Distance.from.Residence.to.Work 0.30376039 -0.80972358 -0.13614060
## Service.time 0.67758318 0.07139490 0.46505791
## Hit.target 0.11153082 0.07821637 -0.12396238
## Son -0.50898328 -0.20883529 0.36978977
## Pet -0.51154369 -0.48475107 -0.20964448
## Height -0.14674814 0.62389333 -0.14628915
## BMI 0.46745384 -0.17873685 0.59599757
## Freq.absence 0.83169238 -0.30954402 -0.22374740
## Freq.failure -0.05715864 -0.16318038 0.73909977
## Hour.Work.load.Average.day -0.06683185 0.15861433 -0.10002834
## First.start -0.38761326 0.22082828 0.59762696
## [,4] [,5]
## Transportation.expense -0.100568278 -0.041592196
## Distance.from.Residence.to.Work -0.125183337 -0.091542658
## Service.time -0.205585122 -0.046720157
## Hit.target 0.009992964 -0.919409111
## Son -0.492679703 -0.217498311
## Pet 0.241005043 0.059034220
## Height -0.211144719 -0.115589259
## BMI 0.198394176 0.078031842
## Freq.absence -0.032089358 0.021218879
## Freq.failure -0.219158996 -0.052789370
## Hour.Work.load.Average.day -0.666241955 0.280680300
## First.start 0.428095658 -0.002847175
# Communality of each variable = sum of its squared loadings on the
# 5 retained factors (vectorized; replaces twelve copy-pasted assignments).
Communalities <- matrix(rowSums(loadings[, 1:5]^2), nrow = 12)
# Append the grand total, which must equal the sum of the first 5 eigenvalues.
Communalities <- rbind(Communalities, sum(Communalities))
rownames(Communalities) <- c(names(AbsenteeismCont_Norm_presel), "total")
Communalities
## [,1]
## Transportation.expense 0.7219195
## Distance.from.Residence.to.Work 0.7905078
## Service.time 0.7249431
## Hit.target 0.8793366
## Son 0.7294594
## Pet 0.6021798
## Height 0.4901214
## BMI 0.6511223
## Freq.absence 0.8390726
## Freq.failure 0.6269808
## Hour.Work.load.Average.day 0.5622904
## First.start 0.7394412
## total 8.3573749
# Sum of communalities id equal to the sum of eigenvalues
sum(eigen(R)$values[1:5]) #total of communalities check
## [1] 8.357375
#Factor model
#R = L %*% t(L) + Psi
Psi <- matrix(0, nrow = 12, ncol = 12)
# Uniquenesses on the diagonal: 1 - communality for each of the 12 variables
# (replaces the literal vector of twelve 1s).
diag(Psi) <- 1 - Communalities[1:12]
#Psi
Residual_R <- R - loadings[, 1:5] %*% t(loadings[, 1:5]) - Psi
#dimnames(Residual_R)<-names(AbsenteeismCont_Fact)
Residual_R # part of R not explained by the 5-factor model
## Transportation.expense
## Transportation.expense -1.110223e-16
## Distance.from.Residence.to.Work -3.773108e-02
## Service.time 6.550986e-02
## Hit.target 3.734792e-03
## Son -1.201324e-01
## Pet -1.235969e-01
## Height 7.192462e-02
## BMI 8.401914e-02
## Freq.absence -5.366953e-02
## Freq.failure -6.774292e-02
## Hour.Work.load.Average.day -1.801692e-02
## First.start -5.321407e-02
## Distance.from.Residence.to.Work
## Transportation.expense -3.773108e-02
## Distance.from.Residence.to.Work 1.110223e-16
## Service.time 2.991124e-02
## Hit.target -3.874999e-02
## Son -2.974499e-03
## Pet -1.704748e-02
## Height 1.374349e-01
## BMI -2.485222e-02
## Freq.absence -4.348956e-02
## Freq.failure -5.313915e-02
## Hour.Work.load.Average.day 1.090233e-04
## First.start 1.228471e-01
## Service.time Hit.target Son
## Transportation.expense 0.06550986 0.003734792 -0.120132379
## Distance.from.Residence.to.Work 0.02991124 -0.038749993 -0.002974499
## Service.time 0.00000000 -0.029794139 0.028452908
## Hit.target -0.02979414 0.000000000 -0.113651557
## Son 0.02845291 -0.113651557 0.000000000
## Pet 0.06208614 0.082962578 -0.040141545
## Height 0.01601880 -0.087659025 -0.020501620
## BMI -0.03636557 0.070936798 -0.048804882
## Freq.absence -0.11231187 0.001185404 0.032380418
## Freq.failure -0.19386183 0.012861110 -0.133967665
## Hour.Work.load.Average.day -0.03443698 0.219884329 -0.191055165
## First.start 0.01821114 0.067686181 -0.029808326
## Pet Height BMI
## Transportation.expense -0.12359694 0.07192462 0.08401914
## Distance.from.Residence.to.Work -0.01704748 0.13743495 -0.02485222
## Service.time 0.06208614 0.01601880 -0.03636557
## Hit.target 0.08296258 -0.08765903 0.07093680
## Son -0.04014155 -0.02050162 -0.04880488
## Pet 0.00000000 0.16419043 0.15818349
## Height 0.16419043 0.00000000 0.20034581
## BMI 0.15818349 0.20034581 0.00000000
## Freq.absence 0.04045213 0.03093828 -0.02705039
## Freq.failure 0.03648570 0.03410160 -0.08891568
## Hour.Work.load.Average.day 0.17107833 -0.12166775 0.12979175
## First.start 0.01933313 -0.06014383 -0.06029037
## Freq.absence Freq.failure
## Transportation.expense -5.366953e-02 -0.06774292
## Distance.from.Residence.to.Work -4.348956e-02 -0.05313915
## Service.time -1.123119e-01 -0.19386183
## Hit.target 1.185404e-03 0.01286111
## Son 3.238042e-02 -0.13396767
## Pet 4.045213e-02 0.03648570
## Height 3.093828e-02 0.03410160
## BMI -2.705039e-02 -0.08891568
## Freq.absence -1.110223e-16 0.10544071
## Freq.failure 1.054407e-01 0.00000000
## Hour.Work.load.Average.day 7.981721e-03 -0.02893661
## First.start 4.146288e-02 -0.08771162
## Hour.Work.load.Average.day First.start
## Transportation.expense -0.0180169171 -5.321407e-02
## Distance.from.Residence.to.Work 0.0001090233 1.228471e-01
## Service.time -0.0344369777 1.821114e-02
## Hit.target 0.2198843293 6.768618e-02
## Son -0.1910551652 -2.980833e-02
## Pet 0.1710783279 1.933313e-02
## Height -0.1216677458 -6.014383e-02
## BMI 0.1297917523 -6.029037e-02
## Freq.absence 0.0079817209 4.146288e-02
## Freq.failure -0.0289366142 -8.771162e-02
## Hour.Work.load.Average.day 0.0000000000 2.047682e-01
## First.start 0.2047682465 -1.110223e-16
The residual matrix is close to 0, seems the part not explained by the model is small.
varimax(loadings[,1:5])
## $loadings
##
## Loadings:
## [,1] [,2] [,3] [,4] [,5]
## Transportation.expense -0.639 -0.248 0.499
## Distance.from.Residence.to.Work -0.877
## Service.time 0.817 -0.197 0.132
## Hit.target -0.932
## Son -0.202 0.800 -0.186 -0.104
## Pet -0.718 -0.207 0.170
## Height 0.582 -0.315 -0.192
## BMI 0.546 -0.278 0.171 0.457 0.193
## Freq.absence 0.447 -0.678 -0.407
## Freq.failure 0.288 0.704 0.186 0.112
## Hour.Work.load.Average.day 0.114 0.170 -0.685 0.212
## First.start 0.474 0.325 0.620 0.155
##
## [,1] [,2] [,3] [,4] [,5]
## SS loadings 2.238 2.030 1.751 1.280 1.058
## Proportion Var 0.187 0.169 0.146 0.107 0.088
## Cumulative Var 0.187 0.356 0.502 0.608 0.696
##
## $rotmat
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.786894819 -0.450712209 -0.4155760 -0.006701223 -0.07004800
## [2,] 0.349245398 0.880880154 -0.2787076 -0.135256006 -0.07813902
## [3,] 0.469926332 0.127947144 0.7076963 0.465571144 0.21262241
## [4,] -0.194735903 0.066994894 -0.4633279 0.860599406 0.04780725
## [5,] -0.008447833 0.007062055 -0.1846942 -0.155798201 0.97030601
Factor1: -0.817 Service.time, 0.718 Pet
Factor2: 0.877 Distance.from.Residence.to.Work, 0.678 Freq.absence
Factor3: 0.800 Son, 0.704 Freq.failure
Factor4: -0.685 Hour.Work.load.Average.day, 0.620 First.start
Factor5: -0.932 Hit.target
Since the interpretation is still confused, the cumulative variance is only 0.696, and moreover the KMO was not appropriate, we will use the principal components as inputs for our clustering.
Number of clusters
We will show the Elbow, Silhouette and Gap statistics methods.
library("NbClust")
library(FactoMineR)
library(factoextra)
# Elbow method
# Within-cluster sum of squares for hcut on the 9 PC scores (Spearman dissimilarity).
fviz_nbclust(matcomp9, hcut, method = "wss", diss=get_dist(matcomp9, method="spearman")) +
#geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")

It seems there is not an “elbow” point. Three and seven could be considered.
# Silhouette method
# Average silhouette width per number of clusters (higher is better).
fviz_nbclust(matcomp9, hcut, method = "silhouette", diss=get_dist(matcomp9, method="spearman") )+
labs(subtitle = "Silhouette method")

The Silhouette method suggests 10 clusters (the maximum number allowed), but there are also peaks at 3 and 7, as the Elbow method suggested.
# Gap statistic
# nboot = 50 to keep the function speedy.
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
# Seed fixed so the bootstrap is reproducible.
set.seed(123)
fviz_nbclust(matcomp9, hcut, nstart = 25, method = "gap_stat", nboot = 50, diss=get_dist(matcomp9, method="spearman"))+
labs(subtitle = "Gap statistic method")

It is difficult to draw a conclusion from the gap statistic plot.
From these three methods it is hard to determine the right number of clusters, but 3 and 7 are possible candidates.
It could be nice to use the command NbClust, but since the function of it has some problems when you give as input a dissimilarity matrix without a data matrix, we couldn’t (check it with the teacher!).
Grouping the hours
Because the goal is to control if the clustering groups well considering the absenteeism hours, in the way we can describe better the data set in groups, we need to group the hours. During the cluster analysis, we will consider if the clusters have a good classification regarding low and high hours. To group the hours, we remind the distribution of the absenteeism hours without outliers:
# Distribution of absenteeism hours (outliers removed): bar plot + box plot.
par(mfrow=c(1,2))
barplot(table(AbsenteeismCont$Absenteeism.time.in.hours), col = 'lavender')
boxplot(AbsenteeismCont$Absenteeism.time.in.hours)

#table(AbsenteeismCont$Absenteeism.time.in.hours)
#median(AbsenteeismCont$Absenteeism.time.in.hours) = 3
#mean(AbsenteeismCont$Absenteeism.time.in.hours) = 5.59
We can consider these 4 groups, as the histogram suggests and as we considered for MCA.
# Discretize absenteeism hours into 4 groups.
# Vectorized replacement for a row-by-row loop that hard-coded the bound
# 1:667: this version covers all rows whatever nrow is, and does not
# error on a missing value (NA stays NA instead of crashing if()).
hours <- AbsenteeismCont$Absenteeism.time.in.hours
AbsenteeismCont$Hoursgroup <- ifelse(hours < 2, "1hours",
                              ifelse(hours < 3, "2hours",
                              ifelse(hours < 8, "midtimeinhours", "lotofhours")))
# as.factor() keeps the same alphabetical level order as the original code
# ("1hours", "2hours", "lotofhours", "midtimeinhours").
AbsenteeismCont$Hoursgroup=as.factor(AbsenteeismCont$Hoursgroup)
barplot(table(AbsenteeismCont$Hoursgroup), col = 'lavender')

Clustering algorithms
Fixed 3 or 7 as numbers of clusters, We decided to act like this: -hierarchical methods: complete, single, average and ward with 3 clusters -partitioning methods: K-means with 3 clusters and K-Medoids with 3 clusters and after, with 7 clusters.
We will consider as distances the Spearman and Kendall ones.
Hierarchical
Compute the Spearman and Kendall distances:
#d<-dist(matcomp9)
# Kendall and Spearman rank-based dissimilarity matrices on the 9 PC scores.
library(factoextra)
dSpearm=get_dist(matcomp9, method = "spearman")
dKend=get_dist(matcomp9, method = "kendall")
# Visualize the two distance matrices.
fviz_dist(dSpearm)

fviz_dist(dKend)

#ward method, Spearman distance
fit_ward<-hclust(dSpearm,method="ward.D")
plot(fit_ward)
# Highlight the 3-cluster cut on the dendrogram.
rect.hclust(fit_ward, k=3,border="red")

groups_wardSpearm <- cutree(fit_ward, k=3)
AbsenteeismCont$groups_wardSpearm<-groups_wardSpearm #store the Ward cluster assignments as a column
table(groups_wardSpearm)
## groups_wardSpearm
## 1 2 3
## 256 299 112
table(AbsenteeismCont$Hoursgroup,AbsenteeismCont$groups_wardSpearm)
##
## 1 2 3
## 1hours 25 42 19
## 2hours 46 83 28
## lotofhours 111 116 22
## midtimeinhours 74 58 43
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_wardSpearm)
##
## 1 2 3
## 1 25 42 19
## 2 46 83 28
## 3 38 38 33
## 4 33 15 10
## 5 2 5 0
## 7 1 0 0
## 8 96 91 17
## 16 6 11 1
## 24 4 8 3
## 32 2 2 1
## 40 3 3 0
## 48 0 1 0
#ward method, Kendall distance
fit_ward<-hclust(dKend,method="ward.D")
plot(fit_ward)
# Highlight the 3-cluster cut on the dendrogram.
rect.hclust(fit_ward, k=3,border="red")

groups_wardKend <- cutree(fit_ward, k=3)
AbsenteeismCont$groups_wardKend<-groups_wardKend #store the Ward (Kendall) cluster assignments as a column
table(groups_wardKend)
## groups_wardKend
## 1 2 3
## 346 209 112
table(AbsenteeismCont$Hoursgroup,AbsenteeismCont$groups_wardKend)
##
## 1 2 3
## 1hours 47 20 19
## 2hours 92 37 28
## lotofhours 138 89 22
## midtimeinhours 69 63 43
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_wardKend)
##
## 1 2 3
## 1 47 20 19
## 2 92 37 28
## 3 42 34 33
## 4 22 26 10
## 5 5 2 0
## 7 0 1 0
## 8 110 77 17
## 16 12 5 1
## 24 10 2 3
## 32 2 2 1
## 40 3 3 0
## 48 1 0 0
# single linkage, Spearman distance
fit_single<-hclust(dSpearm, method="single")
plot(fit_single)
rect.hclust(fit_single, k=3, border="red")

groups_singleSpearm <- cutree(fit_single, k=3)
# Store the assignments (note in the output below that single linkage puts
# almost everything in one cluster — the usual chaining effect).
AbsenteeismCont$groups_singleSpearm<-groups_singleSpearm
table(groups_singleSpearm)
## groups_singleSpearm
## 1 2 3
## 662 4 1
table(AbsenteeismCont$Hoursgroup,groups_singleSpearm)
## groups_singleSpearm
## 1 2 3
## 1hours 85 1 0
## 2hours 157 0 0
## lotofhours 246 3 0
## midtimeinhours 174 0 1
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_singleSpearm)
##
## 1 2 3
## 1 85 1 0
## 2 157 0 0
## 3 109 0 0
## 4 57 0 1
## 5 7 0 0
## 7 1 0 0
## 8 201 3 0
## 16 18 0 0
## 24 15 0 0
## 32 5 0 0
## 40 6 0 0
## 48 1 0 0
# single linkage, Kendall distance
fit_single<-hclust(dKend, method="single")
plot(fit_single)
rect.hclust(fit_single, k=3, border="red")

groups_singleKend <- cutree(fit_single, k=3)
AbsenteeismCont$groups_singleKend<-groups_singleKend
table(groups_singleKend)
## groups_singleKend
## 1 2 3
## 664 1 2
table(AbsenteeismCont$Hoursgroup,groups_singleKend)
## groups_singleKend
## 1 2 3
## 1hours 86 0 0
## 2hours 156 0 1
## lotofhours 248 0 1
## midtimeinhours 174 1 0
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_singleKend)
##
## 1 2 3
## 1 86 0 0
## 2 156 0 1
## 3 109 0 0
## 4 57 1 0
## 5 7 0 0
## 7 1 0 0
## 8 203 0 1
## 16 18 0 0
## 24 15 0 0
## 32 5 0 0
## 40 6 0 0
## 48 1 0 0
# complete linkage, Spearman distance
fit_complete<-hclust(dSpearm, method="complete")
plot(fit_complete)
rect.hclust(fit_complete, k=3, border="red")

groups_completeSpearm <- cutree(fit_complete, k=3)
AbsenteeismCont$groups_completeSpearm<-groups_completeSpearm
table(groups_completeSpearm)
## groups_completeSpearm
## 1 2 3
## 294 268 105
table(AbsenteeismCont$Hoursgroup,groups_completeSpearm)
## groups_completeSpearm
## 1 2 3
## 1hours 40 37 9
## 2hours 64 72 21
## lotofhours 102 104 43
## midtimeinhours 88 55 32
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_completeSpearm)
## groups_completeSpearm
## 1 2 3
## 1 40 37 9
## 2 64 72 21
## 3 60 37 12
## 4 26 13 19
## 5 1 5 1
## 7 1 0 0
## 8 89 79 36
## 16 4 10 4
## 24 6 9 0
## 32 2 2 1
## 40 1 3 2
## 48 0 1 0
# complete linkage, Kendall distance
fit_complete<-hclust(dKend, method="complete")
plot(fit_complete)
rect.hclust(fit_complete, k=3, border="red")

groups_completeKend <- cutree(fit_complete, k=3)
AbsenteeismCont$groups_completeKend<-groups_completeKend
table(groups_completeKend)
## groups_completeKend
## 1 2 3
## 246 246 175
table(AbsenteeismCont$Hoursgroup,groups_completeKend)
## groups_completeKend
## 1 2 3
## 1hours 34 28 24
## 2hours 75 51 31
## lotofhours 80 99 70
## midtimeinhours 57 68 50
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_completeKend)
## groups_completeKend
## 1 2 3
## 1 34 28 24
## 2 75 51 31
## 3 32 42 35
## 4 21 23 14
## 5 4 3 0
## 7 0 0 1
## 8 66 79 59
## 16 7 8 3
## 24 5 5 5
## 32 2 1 2
## 40 0 5 1
## 48 0 1 0
# average linkage, Spearman distance
fit_average<-hclust(dSpearm, method="average")
plot(fit_average)
rect.hclust(fit_average, k=3, border="red")

groups_averageSpearm <- cutree(fit_average, k=3)
table(groups_averageSpearm)
## groups_averageSpearm
## 1 2 3
## 346 206 115
AbsenteeismCont$groups_averageSpearm<-groups_averageSpearm
table(AbsenteeismCont$Hoursgroup,groups_averageSpearm)
## groups_averageSpearm
## 1 2 3
## 1hours 47 20 19
## 2hours 94 35 28
## lotofhours 135 90 24
## midtimeinhours 70 61 44
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_averageSpearm)
## groups_averageSpearm
## 1 2 3
## 1 47 20 19
## 2 94 35 28
## 3 42 34 33
## 4 23 24 11
## 5 5 2 0
## 7 0 1 0
## 8 107 79 18
## 16 13 4 1
## 24 9 2 4
## 32 2 2 1
## 40 3 3 0
## 48 1 0 0
# Average linkage on the Kendall-based distance, cut at k = 3.
# average linkage kendal
fit_average<-hclust(dKend, method="average")
plot(fit_average)
rect.hclust(fit_average, k=3, border="red")

groups_averageKend <- cutree(fit_average, k=3)
table(groups_averageKend)
## groups_averageKend
## 1 2 3
## 341 114 212
AbsenteeismCont$groups_averageKend<-groups_averageKend
table(AbsenteeismCont$Hoursgroup,groups_averageKend)
## groups_averageKend
## 1 2 3
## 1hours 46 19 21
## 2hours 90 28 39
## lotofhours 134 24 91
## midtimeinhours 71 43 61
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_averageKend)
## groups_averageKend
## 1 2 3
## 1 46 19 21
## 2 90 28 39
## 3 42 33 34
## 4 24 10 24
## 5 5 0 2
## 7 0 0 1
## 8 107 18 79
## 16 12 1 5
## 24 9 4 2
## 32 2 1 2
## 40 3 0 3
## 48 1 0 0
# Centroid linkage on the Spearman-based distance.  The recorded output shows
# a degenerate cut (664/2/1), i.e. the centroid method is useless here.
# centroid method spearman
fit_centroid<-hclust(dSpearm, method="centroid")
plot(fit_centroid)
rect.hclust(fit_centroid, k=3, border="red")

groups_centroidSpearm <- cutree(fit_centroid, k=3)
table(groups_centroidSpearm)
## groups_centroidSpearm
## 1 2 3
## 664 2 1
AbsenteeismCont$groups_centroidSpearm<-groups_centroidSpearm
table(AbsenteeismCont$Hoursgroup,groups_centroidSpearm)
## groups_centroidSpearm
## 1 2 3
## 1hours 86 0 0
## 2hours 157 0 0
## lotofhours 247 2 0
## midtimeinhours 174 0 1
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_centroidSpearm)
## groups_centroidSpearm
## 1 2 3
## 1 86 0 0
## 2 157 0 0
## 3 109 0 0
## 4 57 0 1
## 5 7 0 0
## 7 1 0 0
## 8 203 1 0
## 16 18 0 0
## 24 14 1 0
## 32 5 0 0
## 40 6 0 0
## 48 1 0 0
# Centroid linkage on the Kendall-based distance: the cut is again degenerate
# (662/4/1), confirming the centroid method is not usable on this data.
# centroid method kendal
fit_centroid<-hclust(dKend, method="centroid")
plot(fit_centroid)
rect.hclust(fit_centroid, k=3, border="red")

groups_centroidKend <- cutree(fit_centroid, k=3)
table(groups_centroidKend)
## groups_centroidKend
## 1 2 3
## 662 4 1
AbsenteeismCont$groups_centroidKend<-groups_centroidKend
table(AbsenteeismCont$Hoursgroup,groups_centroidKend)
## groups_centroidKend
## 1 2 3
## 1hours 86 0 0
## 2hours 157 0 0
## lotofhours 245 4 0
## midtimeinhours 174 0 1
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_centroidKend)
## groups_centroidKend
## 1 2 3
## 1 86 0 0
## 2 157 0 0
## 3 109 0 0
## 4 57 0 1
## 5 7 0 0
## 7 1 0 0
## 8 201 3 0
## 16 17 1 0
## 24 15 0 0
## 32 5 0 0
## 40 6 0 0
## 48 1 0 0
None of the hierarchical methods helps to discriminate between low and high absenteeism hours. Still, we keep complete linkage with the Kendall distance and average linkage with the Kendall distance, and we compare them:
library(dendextend)
# Build the three candidate dendrograms and compare them.
# Each distance matrix is computed once and reused (the original pipeline
# recomputed the Kendall distance for every dendrogram); the resulting
# trees are identical to the originals.
dist_kendall <- get_dist(matcomp9, method = "kendall")
dist_spearman <- get_dist(matcomp9, method = "spearman")
dend_complete <- dist_kendall %>% hclust("complete") %>% as.dendrogram
dend_average <- dist_kendall %>% hclust("average") %>% as.dendrogram
dend_completeS <- dist_spearman %>% hclust("complete") %>% as.dendrogram
# Compute the pairwise dendrogram correlation matrix
dend_list <- dendlist("Complete" = dend_complete,
"Average" = dend_average, "CompleteS" = dend_completeS)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)
## Complete Average CompleteS
## Complete 1.00 0.75 0.7
## Average 0.75 1.00 0.8
## CompleteS 0.70 0.80 1.0
library(corrplot)
corrplot(cors, "pie", "lower")

# Side-by-side tanglegram of the two kept dendrograms to inspect how well
# their leaf orderings align.
tanglegram(dend_complete, dend_average)
The two methods, as the correlation matrix already suggested, are discordant with each other.
# Entanglement between the two dendrograms (0 = perfectly aligned,
# 1 = fully entangled), followed by their cophenetic correlation.
entanglement(dend_complete, dend_average)
## [1] 0.3644783
dend_list <- dendlist(dend_complete, dend_average)
# Cophenetic correlation matrix
cor.dendlist(dend_list, method = "cophenetic")
## [,1] [,2]
## [1,] 1.0000000 0.7455586
## [2,] 0.7455586 1.0000000
We want a value as close to 1 as possible; here we obtain 0.74.
# Baker's gamma correlation between the same two dendrograms, as an
# alternative agreement measure to the cophenetic correlation.
# Baker correlation matrix
cor.dendlist(dend_list, method = "baker")
## [,1] [,2]
## [1,] 1.0000000 0.5109028
## [2,] 0.5109028 1.0000000
The Baker estimate (0.51) is lower than the cophenetic one (0.74), indicating even weaker agreement between the two dendrograms.
Partitioning algorithms
Now we will apply k-means and k-medoids and compare the results, we expect the k-medoids method be more accurate in general.
# K-means with k = 3 clusters on matcomp9, keeping the best of 2365 random
# starts (lowest total within-cluster SS).
# NOTE: the original passed the whole candidate vector to `algorithm`;
# stats::kmeans resolves that via match.arg(), which silently selects the
# first entry.  The choice "Hartigan-Wong" is now made explicit — the
# fitted result is unchanged.
clk <- kmeans(matcomp9, centers = 3, iter.max = 100, nstart = 2365,
              algorithm = "Hartigan-Wong", trace = FALSE)
#clk
Note that between_SS / total_SS = 31.4 %.
Considering the relevance of each variable for the discrimination in clusters:
# One-way ANOVAs: does k-means cluster membership (clusterKM) explain each of
# the first 16 (continuous) columns of AbsenteeismCont?
# NOTE(review): columns are referenced by numeric index; the inline "#no, ..."
# comments name a few of them (5 = Hit.target, 10 = Absenteeism.time.in.hours,
# 15 = Hour.Work.load.Average.day, 16 = Number.of.days.absent) — verify those
# indices against str(AbsenteeismCont).
AbsenteeismCont$clusterKM<-as.factor(clk$cluster)
#str(AbsenteeismCont) #1 to 15, 17
summary(aov(AbsenteeismCont[,1]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 1154919 577460 222.8 <2e-16 ***
## Residuals 664 1720714 2591
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,2]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 96598 48299 634.9 <2e-16 ***
## Residuals 664 50516 76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,3]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 5292 2646.1 217.7 <2e-16 ***
## Residuals 664 8072 12.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,4]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 1747 873.7 23.54 1.33e-10 ***
## Residuals 664 24646 37.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,5]~AbsenteeismCont$clusterKM,data=AbsenteeismCont)) #no, Hit.target
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 92 45.97 4.95 0.00735 **
## Residuals 664 6167 9.29
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,6]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 305.8 152.90 205.9 <2e-16 ***
## Residuals 664 493.2 0.74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,7]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 360.0 179.99 155.4 <2e-16 ***
## Residuals 664 769.2 1.16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,8]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 16931 8466 62.01 <2e-16 ***
## Residuals 664 90651 137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,9]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 5383 2691.5 94.07 <2e-16 ***
## Residuals 664 18998 28.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,10]~AbsenteeismCont$clusterKM,data=AbsenteeismCont)) #no, Absenteeism.time.in.hours
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 263 131.74 3.49 0.0311 *
## Residuals 664 25064 37.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,11]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 3064 1532.1 119.1 <2e-16 ***
## Residuals 664 8545 12.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,12]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 492943 246471 576.2 <2e-16 ***
## Residuals 664 284035 428
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,13]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 157.4 78.70 44.66 <2e-16 ***
## Residuals 664 1170.3 1.76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,14]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 2.35 1.1745 2.747 0.0649 .
## Residuals 664 283.95 0.4276
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,15]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))#no, Hour.Work.load.Average.day
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 12 6.001 3.274 0.0385 *
## Residuals 664 1217 1.833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,16]~AbsenteeismCont$clusterKM,data=AbsenteeismCont))#No, Number.of.days.absent
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKM 2 1865 932.4 50.65 <2e-16 ***
## Residuals 664 12223 18.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In general the p-values are near 0, meaning that cluster membership is significant across the 16 variables. The difference is less significant for some variables: Hit.target, Absenteeism.time.in.hours, Hour.Work.load.Average.day and the number of days absent.
This result could be considered already negative for our analysis, since we wanted that the clusters could discriminate between low and high hours of absence, but the p value is 0.0311.
Without considering some of them, but considering absenteeism.time.in.hours anyway, we performed a description of the clusters:
# Cluster profile for k-means: mean of each retained continuous variable per
# cluster.  Columns 5, 15, 16 and the appended grouping columns 17:28 are
# excluded (13 variables remain; the rownames below show which ones).
CL_Means_KM<-matrix(rep(0,(13*3)),nrow=13)
#str(AbsenteeismCont)
#dim(AbsenteeismCont)
rownames(CL_Means_KM)<-names(AbsenteeismCont)[-c(5,15,16,17:28)]
colnames(CL_Means_KM)<-c("CL1","CL2", "CL3")
CL_Means_KM[,1]<-round(colMeans(subset(AbsenteeismCont,clusterKM==1)[,-c(5,15,16,17:28)]),2)
CL_Means_KM[,2]<-round(colMeans(subset(AbsenteeismCont,clusterKM==2)[,-c(5,15,16,17:28)]),2)
CL_Means_KM[,3]<-round(colMeans(subset(AbsenteeismCont,clusterKM==3)[,-c(5,15,16,17:28)]),2)
CL_Means_KM
## CL1 CL2 CL3
## Transportation.expense 268.93 183.91 184.91
## Distance.from.Residence.to.Work 33.83 50.87 17.39
## Service.time 10.39 18.49 12.17
## Age 34.30 38.49 36.96
## Son 1.75 0.00 0.63
## Pet 1.61 0.09 0.13
## Weight 75.97 89.85 76.86
## Height 169.57 170.09 175.48
## Absenteeism.time.in.hours 5.93 4.20 5.81
## BMI 26.45 31.05 24.86
## Freq.absence 39.52 107.22 30.91
## Freq.failure 1.89 0.96 0.89
## Hour.Work.load.Average.day 4.53 4.38 4.55
CL1: high transportation expense, low service time, low age, quite high number of sons and pet, not the highest freq.absence but the highest freq.failure
CL2: high distance from residence to work, high service time, no son, high BMI and highest Freq.absence.
CL3: low distance from residence to work, oldest
# Cross-tabulate the k-means clusters with the binned and raw hours variables
# to check whether the clusters separate low vs. high absenteeism.
table(AbsenteeismCont$clusterKM,AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 32 51 123 74
## 2 19 28 22 43
## 3 35 78 104 58
table(AbsenteeismCont$clusterKM,AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 32 51 40 30 3 1 107 7 3 2 4 0
## 2 19 28 33 10 0 0 17 1 3 1 0 0
## 3 35 78 36 18 4 0 80 10 9 2 2 1
From the p-values but also for the table, we can not really link clusters and hours. But we can just say that cluster 2 usually has less absent people and these people are: quite far from work, highest service time, oldest, don’t have son and pet but have an high freq.absence. Then, probably they skip work often but just for an average of 4.20hours.
Arriving at this point, we asked if it could be possible to see also if the clustering explains the reason of absence:
# Cross-tabulate the grouped absence reasons with the k-means clusters.
table(Absenteeism_Clustering$Reason.for.absence.short,AbsenteeismCont$clusterKM )
##
## 1 2 3
## Accompanying person 32 0 6
## Dental consultation 47 27 33
## Diagnosis, donation and vaccination 12 4 24
## Diseases 60 22 93
## Injury, poisoning 21 0 13
## Medical consultation 74 18 55
## Physiotheraphy 4 38 26
## Pregnancy, childbirth, perinatal complications 2 0 4
## Symptons and abnormal exams 7 2 11
## Unjustified 21 1 10
And we can just say that CL2 use physiotherapy as reason mostly, and never accompanying person, injury, poisoning and pregnancy stuffs. It could make sense since they don’t have son and they are the oldest.
All the results should be “taken with a grain of salt”.
Because the cleaned data set may still contain some outliers, we believe the K-medoids algorithm will be more accurate, being less influenced by their presence. The difference between K-means and K-medoids is this: K-means uses a non-existing point (the cluster mean) as the centroid, whereas K-medoids uses as centroid one of the actual points located near the centre of the cluster.
# K-medoids (PAM) via fpc::pamk on the precomputed Spearman and Kendall
# distance matrices (diss=TRUE), forcing 3 clusters.
# NOTE(review): pamk's formal argument is `krange`; `k=3` works only through
# R's partial argument matching — consider spelling out `krange = 3`.
library(fpc)
clmSpearm=pamk(dSpearm, k=3, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
#clm$pamobject$medoids
#clm$pamobject$clustering
#clm$pamobject$id.med
table(clmSpearm$pamobject$clustering)
##
## 1 2 3
## 238 235 194
clmKen=pamk(dKend, k=3, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
#clmKen
table(clmKen$pamobject$clustering)
##
## 1 2 3
## 237 201 229
Proceeding doing the same things we did for k-means technique:
# One-way ANOVAs: does K-medoids cluster membership (clusterKMed, from the
# Spearman-distance PAM fit) explain each of the first 16 columns of
# AbsenteeismCont?  Same column-index caveat as for the k-means ANOVAs:
# the indices are not named here — verify against str(AbsenteeismCont).
AbsenteeismCont$clusterKMed<-as.factor(clmSpearm$pamobject$clustering)
#str(AbsenteeismCont)
summary(aov(AbsenteeismCont[,1]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 417663 208831 56.41 <2e-16 ***
## Residuals 664 2457970 3702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,2]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 52331 26165 183.3 <2e-16 ***
## Residuals 664 94783 143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,3]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 230 114.81 5.804 0.00317 **
## Residuals 664 13135 19.78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,4]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 915 457.3 11.92 8.21e-06 ***
## Residuals 664 25478 38.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,5]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 214 107.2 11.77 9.46e-06 ***
## Residuals 664 6045 9.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,6]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 56.4 28.203 25.22 2.78e-11 ***
## Residuals 664 742.6 1.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,7]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 88.6 44.32 28.29 1.63e-12 ***
## Residuals 664 1040.5 1.57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,8]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 5096 2548.2 16.51 1.01e-07 ***
## Residuals 664 102485 154.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,9]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 2701 1350.6 41.37 <2e-16 ***
## Residuals 664 21680 32.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,10]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont)) #no, Absenteeism.time.in.hours
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 14 6.96 0.183 0.833
## Residuals 664 25313 38.12
summary(aov(AbsenteeismCont[,11]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 1357 678.4 43.94 <2e-16 ***
## Residuals 664 10252 15.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,12]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 119803 59901 60.52 <2e-16 ***
## Residuals 664 657175 990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,13]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 105.2 52.59 28.56 1.26e-12 ***
## Residuals 664 1222.5 1.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,14]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 121.2 60.58 243.6 <2e-16 ***
## Residuals 664 165.1 0.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,15]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont)) #no, Hour.Work.load.Average.day
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 14.4 7.214 3.943 0.0198 *
## Residuals 664 1214.8 1.829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,16]~AbsenteeismCont$clusterKMed,data=AbsenteeismCont)) #no, Number.of.days.absent
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed 2 1259 629.3 32.57 3.21e-14 ***
## Residuals 664 12829 19.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cluster membership is not significant for: absence hours and days and Hour.Work.load.Average.day. Also in this case it is a negative result that cluster membership does not discriminate on Absenteeism.time.in.hours (p-value = 0.833).
# Visualise the 3-cluster K-medoids (PAM, Spearman distance) partition on the
# principal components.
# Fixes: palette = NULL (the original passed the string "NULL", which is not a
# palette name) and ggtheme spelled out in full (the original `ggthem=` only
# worked through R's partial argument matching).
fviz_cluster(object = list(data = AbsenteeismCont_Norm_presel,
                           cluster = clmSpearm$pamobject$clustering),
             repel = TRUE, show.clust.cent = TRUE, palette = NULL,
             ggtheme = theme_minimal(),
             main = "K-medoids with Spearman Distance of the PC",
             geom = c("point"), ellipse = TRUE)

# Cluster profile for K-medoids: mean of each retained continuous variable per
# cluster (14 variables; only columns 15, 16 and the appended grouping columns
# 17:29 are excluded, so Hit.target is kept here, unlike CL_Means_KM).
CL_Means_KMed<-matrix(rep(0,(3*14)),nrow=14)
#str(AbsenteeismCont)
#dim(AbsenteeismCont)
rownames(CL_Means_KMed)<-names(AbsenteeismCont)[-c(15,16,17:29)]
colnames(CL_Means_KMed)<-c("CL1","CL2","CL3")
CL_Means_KMed[,1]<-round(colMeans(subset(AbsenteeismCont,clusterKMed==1)[,-c(15,16,17:29)]),2)
CL_Means_KMed[,2]<-round(colMeans(subset(AbsenteeismCont,clusterKMed==2)[,-c(15,16,17:29)]),2)
CL_Means_KMed[,3]<-round(colMeans(subset(AbsenteeismCont,clusterKMed==3)[,-c(15,16,17:29)]),2)
CL_Means_KMed
## CL1 CL2 CL3
## Transportation.expense 252.72 195.40 209.71
## Distance.from.Residence.to.Work 39.03 18.41 32.65
## Service.time 13.26 11.95 12.19
## Age 36.16 37.40 34.46
## Hit.target 94.30 95.63 95.12
## Son 1.37 0.71 0.87
## Pet 0.89 0.27 1.14
## Weight 82.33 77.16 76.01
## Height 170.15 174.75 171.25
## Absenteeism.time.in.hours 5.78 5.53 5.43
## BMI 28.45 25.21 25.90
## Freq.absence 54.38 29.46 60.37
## Freq.failure 1.79 0.85 1.32
## Hour.Work.load.Average.day 4.16 4.33 5.17
CL1: Highest transportation.expense, high number of sons and pets, low freq.absence, high freq.failure
CL2: Lowest distance, low freq.failure
CL3: Highest distance, highest freq.absence
Comparing with K-means:
# Reprint the k-means cluster means for side-by-side comparison with the
# K-medoids profile above.
CL_Means_KM
## CL1 CL2 CL3
## Transportation.expense 268.93 183.91 184.91
## Distance.from.Residence.to.Work 33.83 50.87 17.39
## Service.time 10.39 18.49 12.17
## Age 34.30 38.49 36.96
## Son 1.75 0.00 0.63
## Pet 1.61 0.09 0.13
## Weight 75.97 89.85 76.86
## Height 169.57 170.09 175.48
## Absenteeism.time.in.hours 5.93 4.20 5.81
## BMI 26.45 31.05 24.86
## Freq.absence 39.52 107.22 30.91
## Freq.failure 1.89 0.96 0.89
## Hour.Work.load.Average.day 4.53 4.38 4.55
The two algorithms agree in some descriptions:
the people with highest absenteeism hours have low freq.absence, high freq.failure, high transportation expense and sons/pet.
the people with lowest absenteeism hours don’t have sons/pets almost, oldest.
Anyway, the clusters are confused.
# Cross-tabulate the K-medoids clusters with the binned and raw hours
# variables, mirroring the k-means checks.
table(clmSpearm$pamobject$clustering)
##
## 1 2 3
## 238 235 194
table(clmSpearm$pamobject$clustering, AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 25 35 101 77
## 2 30 64 88 53
## 3 31 58 60 45
table(clmSpearm$pamobject$clustering, AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 25 35 40 35 1 1 89 5 3 2 2 0
## 2 30 64 34 15 4 0 72 7 5 2 1 1
## 3 31 58 35 8 2 0 43 6 7 1 3 0
#table(clmSpearm$pamobject$clustering, AbsenteeismCont$Freq.absence)
Considering the reason:
# Cross-tabulate the grouped absence reasons with the K-medoids clusters.
table( Absenteeism_Clustering$Reason.for.absence.short,clmSpearm$pamobject$clustering)
##
## 1 2 3
## Accompanying person 21 7 10
## Dental consultation 46 31 30
## Diagnosis, donation and vaccination 10 21 9
## Diseases 42 78 55
## Injury, poisoning 16 10 8
## Medical consultation 52 51 44
## Physiotheraphy 23 14 31
## Pregnancy, childbirth, perinatal complications 1 4 1
## Symptons and abnormal exams 7 9 4
## Unjustified 20 10 2
Comparing with K-means:
# Reprint the k-means reason table for comparison with the K-medoids one.
table(Absenteeism_Clustering$Reason.for.absence.short,AbsenteeismCont$clusterKM )
##
## 1 2 3
## Accompanying person 32 0 6
## Dental consultation 47 27 33
## Diagnosis, donation and vaccination 12 4 24
## Diseases 60 22 93
## Injury, poisoning 21 0 13
## Medical consultation 74 18 55
## Physiotheraphy 4 38 26
## Pregnancy, childbirth, perinatal complications 2 0 4
## Symptons and abnormal exams 7 2 11
## Unjustified 21 1 10
They don’t agree.
- K-MEDOIDS with 7 clusters
# K-medoids (PAM) again on both distance matrices, now forcing 7 clusters.
# NOTE(review): as with the 3-cluster fit, `k=7` relies on partial matching
# of pamk's `krange` argument.
library(fpc)
clmSpearm7=pamk(dSpearm, k=7, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
table(clmSpearm7$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## 111 94 84 75 118 98 87
clmKen7=pamk(dKend, k=7, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
table(clmKen7$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## 105 79 114 112 88 109 60
Proceeding doing the same things we did for k-means.
# One-way ANOVAs: does membership in the 7 K-medoids clusters (clusterKMed7,
# Spearman-distance fit) explain each of the first 16 columns of
# AbsenteeismCont?  All 16 tests come out significant in the recorded output.
# Same column-index caveat as before: verify indices vs. str(AbsenteeismCont).
AbsenteeismCont$clusterKMed7<-as.factor(clmSpearm7$pamobject$clustering)
#str(AbsenteeismCont)
summary(aov(AbsenteeismCont[,1]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 1313152 218859 92.45 <2e-16 ***
## Residuals 660 1562481 2367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,2]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 94442 15740 197.2 <2e-16 ***
## Residuals 660 52672 80
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,3]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 6687 1114.5 110.2 <2e-16 ***
## Residuals 660 6678 10.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,4]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 9470 1578.3 61.55 <2e-16 ***
## Residuals 660 16923 25.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,5]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 811 135.22 16.38 <2e-16 ***
## Residuals 660 5448 8.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,6]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 497.9 82.98 181.9 <2e-16 ***
## Residuals 660 301.1 0.46
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,7]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 469.3 78.22 78.25 <2e-16 ***
## Residuals 660 659.8 1.00
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,8]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 21920 3653 28.15 <2e-16 ***
## Residuals 660 85662 130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,9]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 3235 539.1 16.83 <2e-16 ***
## Residuals 660 21147 32.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,10]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 1130 188.37 5.138 3.57e-05 ***
## Residuals 660 24197 36.66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,11]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 3415 569.2 45.85 <2e-16 ***
## Residuals 660 8194 12.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,12]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 540352 90059 251.2 <2e-16 ***
## Residuals 660 236626 359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,13]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 281.1 46.85 29.54 <2e-16 ***
## Residuals 660 1046.6 1.59
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,14]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 103.5 17.247 62.27 <2e-16 ***
## Residuals 660 182.8 0.277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,15]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 55 9.158 5.148 3.48e-05 ***
## Residuals 660 1174 1.779
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(AbsenteeismCont[,16]~AbsenteeismCont$clusterKMed7,data=AbsenteeismCont))
## Df Sum Sq Mean Sq F value Pr(>F)
## AbsenteeismCont$clusterKMed7 6 3935 655.9 42.64 <2e-16 ***
## Residuals 660 10153 15.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The belonging of the clustering is significant for all the variables in this case, also for absenteeism time in hours!
# Cluster profile for the 7-cluster K-medoids fit: mean of each of the 16
# continuous variables per cluster (only the appended grouping columns 17:30
# are excluded this time).
CL_Means_KMed7<-matrix(rep(0,(7*16)),nrow=16)
#str(AbsenteeismCont)
#dim(AbsenteeismCont)
rownames(CL_Means_KMed7)<-names(AbsenteeismCont)[-c(17:30)]
colnames(CL_Means_KMed7)<-c("CL1","CL2","CL3","CL4","CL5","CL6","CL7")
CL_Means_KMed7[,1]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==1)[,-c(17:30)]),2)
CL_Means_KMed7[,2]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==2)[,-c(17:30)]),2)
CL_Means_KMed7[,3]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==3)[,-c(17:30)]),2)
CL_Means_KMed7[,4]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==4)[,-c(17:30)]),2)
CL_Means_KMed7[,5]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==5)[,-c(17:30)]),2)
CL_Means_KMed7[,6]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==6)[,-c(17:30)]),2)
CL_Means_KMed7[,7]<-round(colMeans(subset(AbsenteeismCont,clusterKMed7==7)[,-c(17:30)]),2)
CL_Means_KMed7
## CL1 CL2 CL3 CL4 CL5 CL6
## Transportation.expense 286.05 233.76 208.80 212.76 192.32 253.90
## Distance.from.Residence.to.Work 34.72 20.44 22.79 21.41 50.91 34.30
## Service.time 12.14 10.31 12.76 14.89 17.78 7.58
## Age 35.51 33.96 37.35 42.69 37.99 28.99
## Hit.target 93.63 96.84 95.48 95.24 95.36 95.28
## Son 2.37 0.30 1.04 1.95 0.04 1.22
## Pet 0.62 0.17 0.57 0.99 0.25 2.64
## Weight 79.10 73.84 78.73 77.53 89.22 70.55
## Height 170.24 175.44 175.51 171.89 170.16 170.16
## Absenteeism.time.in.hours 7.28 6.68 7.08 5.19 4.56 4.28
## BMI 27.27 23.93 25.48 26.32 30.81 24.37
## Freq.absence 31.46 24.77 32.76 16.11 102.81 57.60
## Freq.failure 2.46 0.85 1.79 0.43 0.92 1.59
## Hour.Work.load.Average.day 4.35 4.24 5.52 4.30 4.34 4.57
## Number.of.days.absent 1.67 1.59 1.28 1.21 1.06 0.93
## First.start 23.38 23.65 24.58 27.80 20.21 21.41
## CL7
## Transportation.expense 137.39
## Distance.from.Residence.to.Work 14.80
## Service.time 11.30
## Age 37.74
## Hit.target 93.34
## Son 0.15
## Pet 0.00
## Weight 79.09
## Height 172.49
## Absenteeism.time.in.hours 4.05
## BMI 26.51
## Freq.absence 46.22
## Freq.failure 0.95
## Hour.Work.load.Average.day 4.39
## Number.of.days.absent 0.91
## First.start 26.44
CL1: Lowest distance, oldest, highest freq.failure
CL2: Highest numbers of hours.load.day
CL3: with more sons, highest number of hours
CL4: highest service time, no sons/pets, tallest
CL5: lowest freq.failure
CL6: lowest service time, youngest, lowest BMI
CL7: lowest transportation expense, no pets, lowest number of hours
The results for the clusters with a high number of hours are similar, though the descriptions of those clusters sometimes differ. The clusters are still confused even with 7 groups.
# Visualise the 7-cluster K-medoids (PAM, Spearman distance) partition on the
# principal components.
# Fixes (same as the 3-cluster plot): palette = NULL instead of the string
# "NULL", and the misspelled `ggthem` argument spelled out as `ggtheme`
# (previously it only worked through R's partial argument matching).
fviz_cluster(object = list(data = AbsenteeismCont_Norm_presel,
                           cluster = clmSpearm7$pamobject$clustering),
             repel = TRUE, show.clust.cent = TRUE, palette = NULL,
             ggtheme = theme_minimal(),
             main = "K-medoids with Spearman Distance of the PC",
             geom = c("point"), ellipse = TRUE)

# Cross-tabulate the 7 K-medoids clusters with the binned and raw hours
# variables.
table(clmSpearm7$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## 111 94 84 75 118 98 87
table(clmSpearm7$pamobject$clustering, AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 6 8 64 33
## 2 7 25 44 18
## 3 15 14 39 16
## 4 10 22 32 11
## 5 20 28 25 45
## 6 13 28 26 31
## 7 15 32 19 21
table(clmSpearm7$pamobject$clustering, AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 6 8 11 21 1 0 55 3 3 2 1 0
## 2 7 25 13 5 0 0 34 5 3 0 1 1
## 3 15 14 9 6 1 0 27 5 5 0 2 0
## 4 10 22 6 4 1 0 29 2 0 1 0 0
## 5 20 28 33 11 0 1 19 1 3 1 1 0
## 6 13 28 23 7 1 0 24 1 0 0 1 0
## 7 15 32 14 4 3 0 16 1 1 1 0 0
Considering the reason:
# Cross-tabulate the grouped absence reasons with the 7 K-medoids clusters.
table( Absenteeism_Clustering$Reason.for.absence.short,clmSpearm7$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## Accompanying person 12 4 4 5 2 11 0
## Dental consultation 22 11 13 5 27 12 17
## Diagnosis, donation and vaccination 3 9 0 14 5 6 3
## Diseases 23 35 35 18 24 20 20
## Injury, poisoning 11 4 3 4 0 8 4
## Medical consultation 20 13 19 19 19 34 23
## Physiotheraphy 0 9 2 0 38 4 15
## Pregnancy, childbirth, perinatal complications 1 1 1 3 0 0 0
## Symptons and abnormal exams 3 3 3 6 2 2 1
## Unjustified 16 5 4 1 1 1 4
Clustering with the original variables
We tried to perform the clustering analysis with the original variables, pre-selected and standardized and we obtain the same messy results (see Annex)
Dataset’s creation with unique IDs
At this point, we think it could be interesting to create a new data set from the original one with the IDs not duplicated.
New variables created/transformed: * Sum.Absenteeism.time.in.hours
Avg.Absenteeism.time.in.hours
Avg.Hour.Work.load.Average.day
Sum.Number.of.days.absent
Avg.Number.of.days.absent
Avg.Hit.target
#str(Absenteeism_withcatnames)
# Work on a copy of the full data set; ID becomes a factor so all 36 workers
# are kept as levels.
Absenteeism_ClusteringID=Absenteeism_withcatnames
Absenteeism_ClusteringID$ID=as.factor(Absenteeism_ClusteringID$ID) #to check to have all the IDs
#str(Absenteeism_ClusteringID ) #739 obs 36 IDs
# Drop the variables that change across rows of the same ID (they cannot
# describe a person) plus redundant recodings.
Absenteeism_ClusteringID=subset(Absenteeism_ClusteringID, select=-c(Day.of.the.week.nom, Month.of.absence.nom, Seasons.nom, Reason.for.absence, Month.of.absence,Day.of.the.week,Seasons,Work.load.Average.day, Social.drinker, Social.smoker, Body.mass.index, Reason.for.absence.short
) )
#deleting some variables that change for the same ID
#str(Absenteeism_ClusteringID)
##Making the 'UNIQUE ID DATASET'
#Unique DataSet with the 40 disciplinary-failure rows removed (we don't want to sum the work-load hours of those)
UniqueIDwithoutfailures <- Absenteeism_ClusteringID[!(Absenteeism_ClusteringID$Disciplinary.failure==1),]
str(UniqueIDwithoutfailures) #699 obs (see output below) 36 IDs
## 'data.frame': 699 obs. of 20 variables:
## $ ID : Factor w/ 36 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 235 235 235 235 235 235 235 235 235 235 ...
## $ Distance.from.Residence.to.Work: int 11 11 11 11 11 11 11 11 11 11 ...
## $ Service.time : int 14 14 14 14 14 14 14 14 14 14 ...
## $ Age : int 37 37 37 37 37 37 37 37 37 37 ...
## $ Hit.target : int 94 91 93 94 93 98 93 99 97 93 ...
## $ Disciplinary.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Education : Ord.factor w/ 3 levels "High School"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Son : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Pet : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Weight : int 88 88 88 88 88 88 88 88 88 88 ...
## $ Height : int 172 172 172 172 172 172 172 172 172 172 ...
## $ Absenteeism.time.in.hours : int 1 8 4 16 4 8 4 8 8 3 ...
## $ BMI : num 29.8 29.8 29.8 29.8 29.8 ...
## $ Freq.absence : num 22 22 22 22 22 22 22 22 22 22 ...
## $ Freq.failure : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Hour.Work.load.Average.day : num 4.43 4.1 3.94 6.29 4.16 ...
## $ Number.of.days.absent : num 0.226 1.949 1.014 2.543 0.961 ...
## $ First.start : int 23 23 23 23 23 23 23 23 23 23 ...
## $ Bad.habits : chr "None" "None" "None" "None" ...
##Absenteeism.time.in.hours
#sum the hours missed per ID to have the amount of missed work throughout the 3 years
library(Hmisc)
# Hmisc::summarize(values, group, FUN) returns a data frame with the group
# variable in column 1 and the statistic in column 2; each result is rebuilt
# as a two-column matrix (statistic, ID) so it can be merged by "ID".
sumvalue <- summarize(UniqueIDwithoutfailures$Absenteeism.time.in.hours,UniqueIDwithoutfailures$ID,sum)
sumvalue <- as.matrix(cbind(Sum.Absenteeism.time.in.hours=sumvalue[,2], ID=sumvalue[,1]))
#average the hours missed per ID to have the amount of missed work throughout the 3 years
avgvalue <- summarize(UniqueIDwithoutfailures$Absenteeism.time.in.hours,UniqueIDwithoutfailures$ID,mean)
avgvalue <- as.matrix(cbind(Avg.Absenteeism.time.in.hours=avgvalue[,2], ID=avgvalue[,1]))
#MERGE 1
# all.y=TRUE keeps every ID present in the right-hand table at each step.
Absenteeism_complete_UniqueID <- merge(sumvalue,avgvalue,by="ID",all.y=TRUE)
#Average.Hour.load.Average.day
avgvalue2 <- summarize(UniqueIDwithoutfailures$Hour.Work.load.Average.day,UniqueIDwithoutfailures$ID,mean)
avgvalue2 <- as.matrix(cbind(Avg.Hour.Work.load.Average.day=avgvalue2[,2], ID=avgvalue2[,1]))
#MERGE 2
Absenteeism_complete_UniqueID <- merge(Absenteeism_complete_UniqueID,avgvalue2,by="ID",all.y=TRUE)
#Sum Number.of.days.absent
sumvalue2 <- summarize(UniqueIDwithoutfailures$Number.of.days.absent,UniqueIDwithoutfailures$ID,sum)
sumvalue2 <- as.matrix(cbind(Sum.Number.of.days.absent=sumvalue2[,2], ID=sumvalue2[,1]))
#MERGE 3
Absenteeism_complete_UniqueID <- merge(Absenteeism_complete_UniqueID,sumvalue2,by="ID",all.y=TRUE)
#Avg Number.of.days.absent
avgvalue3 <- summarize(UniqueIDwithoutfailures$Number.of.days.absent,UniqueIDwithoutfailures$ID,mean)
avgvalue3 <- as.matrix(cbind(Avg.Number.of.days.absent=avgvalue3[,2], ID=avgvalue3[,1]))
#MERGE 4
Absenteeism_complete_UniqueID <- merge(Absenteeism_complete_UniqueID,avgvalue3,by="ID",all.y=TRUE)
#Average Hit Target
avgvalue4 <- summarize(UniqueIDwithoutfailures$Hit.target,UniqueIDwithoutfailures$ID,mean)
avgvalue4 <- as.matrix(cbind(Avg.Hit.target=avgvalue4[,2], ID=avgvalue4[,1]))
#MERGE 5
Absenteeism_complete_UniqueID <- merge(Absenteeism_complete_UniqueID,avgvalue4,by="ID",all.y=TRUE)
#take the per-ID constant variables: Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Education, Son, Bad.habits, Pet, Weight, Height, BMI, Freq.absence, Freq.failure
UniqueID_dup <- subset(Absenteeism_ClusteringID, select=c("ID","Transportation.expense","Distance.from.Residence.to.Work", "Service.time", "Age", "Education", "Son", "Bad.habits", "Pet", "Weight", "Height", "BMI", "Freq.absence", "Freq.failure"))
# These columns are constant within an ID, so unique() leaves one row per ID.
without_dup <- unique (UniqueID_dup)
#MERGE 6
Absenteeism_complete_UniqueID <- merge(Absenteeism_complete_UniqueID,without_dup,by="ID",all.y=TRUE)
#NA in the freq.absence: IDs with no recorded absences get NA from the merges; treat as 0.
library(car)
# Replaced car::Recode(x, "NA='0'") with a direct base-R assignment: same
# numeric result, without routing the replacement value through a quoted
# string (which depends on car's numeric re-coercion).
Absenteeism_complete_UniqueID$Freq.absence[is.na(Absenteeism_complete_UniqueID$Freq.absence)] <- 0
Absenteeism_complete_UniqueID is our new data set.
Reason and goals of the clustering analysis for the UniqueID dataset
We would like to perform a clustering analysis to describe Freq.absence with the variables at our disposal. The analyzed data set contains 36 IDs and 21 variables.
Absenteeism_complete_UniqueID$ID=as.factor(Absenteeism_complete_UniqueID$ID)
#new variables as in the full dataset
# First.start = Age - Service.time: approximate age at which the person started working.
Absenteeism_complete_UniqueID$First.start <- Absenteeism_complete_UniqueID$Age-Absenteeism_complete_UniqueID$Service.time
str(Absenteeism_complete_UniqueID)
## 'data.frame': 36 obs. of 21 variables:
## $ ID : Factor w/ 36 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Sum.Absenteeism.time.in.hours : int 121 25 482 0 104 72 30 0 262 186 ...
## $ Avg.Absenteeism.time.in.hours : num 5.5 6.25 4.3 0 7.43 ...
## $ Avg.Hour.Work.load.Average.day : num 4.4 3.53 4.37 4.52 4.38 ...
## $ Sum.Number.of.days.absent : num 27.74 7.25 109.1 0 24.13 ...
## $ Avg.Number.of.days.absent : num 1.261 1.813 0.974 0 1.724 ...
## $ Avg.Hit.target : num 95 92 95.1 95 93.4 ...
## $ Transportation.expense : int 235 235 179 118 235 189 279 231 228 361 ...
## $ Distance.from.Residence.to.Work: int 11 29 51 14 20 29 5 35 14 52 ...
## $ Service.time : int 14 12 18 13 13 13 14 14 16 3 ...
## $ Age : int 37 48 38 40 43 33 39 39 58 28 ...
## $ Education : Ord.factor w/ 3 levels "High School"<..: 3 1 1 1 1 1 1 1 1 1 ...
## $ Son : int 1 1 0 1 1 2 2 2 2 1 ...
## $ Bad.habits : chr "None" "Smoker" "Drinker" "Drinker" ...
## $ Pet : int 1 5 0 8 0 2 0 2 1 4 ...
## $ Weight : int 88 88 89 98 106 69 68 100 65 80 ...
## $ Height : int 172 163 170 170 167 167 168 170 172 172 ...
## $ BMI : num 29.8 33.1 30.8 33.9 38 ...
## $ Freq.absence : num 22 4 112 0 14 8 4 0 8 24 ...
## $ Freq.failure : int 1 2 1 0 5 0 2 1 0 0 ...
## $ First.start : int 23 36 20 27 30 20 25 25 42 25 ...
Histograms and boxplots of the new dataset
We explore the distribution of this new data set; however, we will not remove outliers, since we want to keep all 36 IDs in the study.
# Bar plots of the two categorical variables of the unique-ID data set.
par(mfrow=c(2,1))
barplot(table(Absenteeism_complete_UniqueID$Education),col = 'lavender' )
barplot(table(Absenteeism_complete_UniqueID$Bad.habits), col = 'lavender')

# Histogram (density scale, sqrt(n) bins) + boxplot for each continuous
# variable of the unique-ID data set; outliers are noted but NOT removed.
par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Transportation.expense, breaks = sqrt( length( Absenteeism_complete_UniqueID$Transportation.expense ) ), probability = TRUE,
col = 'lavender', main = 'Transportation.expense', xlab = 'Transportation.expense' )
boxplot(Absenteeism_complete_UniqueID$Transportation.expense)
hist( Absenteeism_complete_UniqueID$Distance.from.Residence.to.Work, breaks = sqrt( length( Absenteeism_complete_UniqueID$Distance.from.Residence.to.Work ) ), probability = TRUE,
col = 'lavender', main = 'Distance.from.Residence.to.Work', xlab = 'Distance.from.Residence.to.Work' )
boxplot(Absenteeism_complete_UniqueID$Distance.from.Residence.to.Work)
hist( Absenteeism_complete_UniqueID$Service.time, breaks = sqrt( length( Absenteeism_complete_UniqueID$Service.time ) ), probability = TRUE,
col = 'lavender', main = 'Service.time', xlab = 'Service.time' )
boxplot(Absenteeism_complete_UniqueID$Service.time)

par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Age, breaks = sqrt( length( Absenteeism_complete_UniqueID$Age ) ), probability = TRUE,
col = 'lavender', main = 'Age', xlab = 'Age' )
boxplot(Absenteeism_complete_UniqueID$Age)
hist( Absenteeism_complete_UniqueID$Son, breaks = sqrt( length( Absenteeism_complete_UniqueID$Son ) ), probability = TRUE,
col = 'lavender', main = 'Son', xlab = 'Son' )
boxplot(Absenteeism_complete_UniqueID$Son)
hist( Absenteeism_complete_UniqueID$Pet, breaks = sqrt( length( Absenteeism_complete_UniqueID$Pet ) ), probability = TRUE,
col = 'lavender', main = 'Pet', xlab = 'Pet' )
boxplot(Absenteeism_complete_UniqueID$Pet)

par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Weight, breaks = sqrt( length( Absenteeism_complete_UniqueID$Weight ) ), probability = TRUE,
col = 'lavender', main = 'Weight', xlab = 'Weight' )
boxplot(Absenteeism_complete_UniqueID$Weight)
hist( Absenteeism_complete_UniqueID$Height, breaks = sqrt( length( Absenteeism_complete_UniqueID$Height ) ), probability = TRUE,
col = 'lavender', main = 'Height', xlab = 'Height' )
boxplot(Absenteeism_complete_UniqueID$Height)# ID 14, 30, 29, 18, 12, 36, 25, 31
hist( Absenteeism_complete_UniqueID$BMI, breaks = sqrt( length( Absenteeism_complete_UniqueID$BMI ) ), probability = TRUE,
col = 'lavender', main = 'BMI', xlab = 'BMI' )
boxplot(Absenteeism_complete_UniqueID$BMI)

# Same histogram + boxplot exploration for the frequency and aggregated
# (Sum./Avg.) variables created in the merge step.
par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Freq.absence, breaks = sqrt( length( Absenteeism_complete_UniqueID$Freq.absence) ), probability = TRUE,
col = 'lavender', main = 'Freq.absence', xlab = 'Freq.absence' )
boxplot(Absenteeism_complete_UniqueID$Freq.absence)
#ID 3 (112) AND ID 28 (72) have really high freq.absence
hist( Absenteeism_complete_UniqueID$Freq.failure, breaks = sqrt( length( Absenteeism_complete_UniqueID$Freq.failure ) ), probability = TRUE,
col = 'lavender', main = 'Freq.failure', xlab = 'Freq.failure' )
boxplot(Absenteeism_complete_UniqueID$Freq.failure) #ID 36
#plot(Absenteeism_complete_UniqueID$Freq.absence, Absenteeism_complete_UniqueID$Freq.failure) to understand if the same people with freq absence high also have freq failure high, but no!
hist( Absenteeism_complete_UniqueID$First.start, breaks = sqrt( length( Absenteeism_complete_UniqueID$First.start ) ), probability = TRUE,
col = 'lavender', main = 'First.start', xlab = 'First.start' )
boxplot(Absenteeism_complete_UniqueID$First.start) #id 9 , 31

par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours, breaks = sqrt( length( Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours ) ), probability = TRUE,
col = 'lavender', main = 'Sum.Absenteeism.time.in.hours', xlab = 'Sum.Absenteeism.time.in.hours' )
boxplot(Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours)
hist( Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours, breaks = sqrt( length( Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours ) ), probability = TRUE,
col = 'lavender', main = 'Avg.Absenteeism.time.in.hours', xlab = 'Avg.Absenteeism.time.in.hours' )
boxplot(Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours)
#plot(Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours,Absenteeism_complete_UniqueID$Freq.absence ) #to understand if the ones with high freq are also the ones with avg absenteeism hours high, but no!
hist( Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day, breaks = sqrt( length( Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day ) ), probability = TRUE,
col = 'lavender', main = 'Avg.Hour.Work.load.Average.day', xlab = 'Avg.Hour.Work.load.Average.day' )
boxplot(Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day)

#plot(Absenteeism_complete_UniqueID$Freq.absence,Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day) to understand if freq absence high means also work load average low, but no!
#plot(Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours,Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day) to understand if avg absence hours high means also avg hour load average day low, but no!
par(mfrow=c(3,2))
hist( Absenteeism_complete_UniqueID$Sum.Number.of.days.absent, breaks = sqrt( length( Absenteeism_complete_UniqueID$Sum.Number.of.days.absent ) ), probability = TRUE,
col = 'lavender', main = 'Sum.Number.of.days.absent', xlab = 'Sum.Number.of.days.absent' )
boxplot(Absenteeism_complete_UniqueID$Sum.Number.of.days.absent)
hist( Absenteeism_complete_UniqueID$Avg.Number.of.days.absent, breaks = sqrt( length( Absenteeism_complete_UniqueID$Avg.Number.of.days.absent ) ), probability = TRUE,
col = 'lavender', main = 'Avg.Number.of.days.absent', xlab = 'Avg.Number.of.days.absent' )
boxplot(Absenteeism_complete_UniqueID$Avg.Number.of.days.absent)
#plot(Absenteeism_complete_UniqueID$Avg.Number.of.days.absent,Absenteeism_complete_UniqueID$Sum.Number.of.days.absent ) to check if high sum corresponds to high avg, but since it doesn't happen with the hours it doesn't happen with the days either
hist( Absenteeism_complete_UniqueID$Avg.Hit.target, breaks = sqrt( length( Absenteeism_complete_UniqueID$Avg.Hit.target ) ), probability = TRUE,
col = 'lavender', main = 'Avg.Hit.target', xlab = 'Avg.Hit.target' )
boxplot(Absenteeism_complete_UniqueID$Avg.Hit.target)

We expect high correlation between the sum of Absenteeism time in hours and Freq absence:
# Scatter plot + OLS line: total absenteeism hours vs absence frequency per ID.
plot(Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours,Absenteeism_complete_UniqueID$Freq.absence )
abline(lm(Absenteeism_complete_UniqueID$Freq.absence~Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours, data=Absenteeism_complete_UniqueID))

# Pearson vs Spearman correlation of the two variables.
cor(Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours,Absenteeism_complete_UniqueID$Freq.absence)
## [1] 0.8203413
cor(Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours,Absenteeism_complete_UniqueID$Freq.absence, method="spearman")
## [1] 0.938706
The Spearman correlation being higher than the Pearson one suggests that the relationship is not purely linear; this could be due to some (verified) outliers in Absenteeism.time.in.hours.
#Absenteeism_complete_UniqueID$Freq.absence, Absenteeism_complete_UniqueID$Freq.failure,Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day, Absenteeism_complete_UniqueID$Avg.Absenteeism.time.in.hours to have a global view
# FIX: use library() instead of require() for loading — require() returns
# FALSE on failure instead of stopping, which would hide a missing package.
library(car)
# Pairwise scatter-plot matrix of the frequency/absence variables.
matr=subset(Absenteeism_complete_UniqueID, select=c(Freq.absence, Freq.failure,Avg.Hour.Work.load.Average.day, Avg.Absenteeism.time.in.hours,Sum.Absenteeism.time.in.hours))
pairs(matr)

#scatterplotMatrix(matr)
library(scatterplot3d)
# 3-D view of failures vs average work load vs total absenteeism hours.
scatterplot3d(Absenteeism_complete_UniqueID$Freq.failure, Absenteeism_complete_UniqueID$Avg.Hour.Work.load.Average.day, Absenteeism_complete_UniqueID$Sum.Absenteeism.time.in.hours, xlab='Freq.failure',ylab='Hour.Work.load.Average.day',zlab='Sum.Absenteeism.time.in.hours')

Correlation analysis
#str(Absenteeism_complete_UniqueID)
# Keep only the continuous variables (drop ID and the two categoricals).
Absenteeism_complete_UniqueIDCon=subset(Absenteeism_complete_UniqueID, select=-c(ID,Education, Bad.habits))
str(Absenteeism_complete_UniqueIDCon) #18 variables
## 'data.frame': 36 obs. of 18 variables:
## $ Sum.Absenteeism.time.in.hours : int 121 25 482 0 104 72 30 0 262 186 ...
## $ Avg.Absenteeism.time.in.hours : num 5.5 6.25 4.3 0 7.43 ...
## $ Avg.Hour.Work.load.Average.day : num 4.4 3.53 4.37 4.52 4.38 ...
## $ Sum.Number.of.days.absent : num 27.74 7.25 109.1 0 24.13 ...
## $ Avg.Number.of.days.absent : num 1.261 1.813 0.974 0 1.724 ...
## $ Avg.Hit.target : num 95 92 95.1 95 93.4 ...
## $ Transportation.expense : int 235 235 179 118 235 189 279 231 228 361 ...
## $ Distance.from.Residence.to.Work: int 11 29 51 14 20 29 5 35 14 52 ...
## $ Service.time : int 14 12 18 13 13 13 14 14 16 3 ...
## $ Age : int 37 48 38 40 43 33 39 39 58 28 ...
## $ Son : int 1 1 0 1 1 2 2 2 2 1 ...
## $ Pet : int 1 5 0 8 0 2 0 2 1 4 ...
## $ Weight : int 88 88 89 98 106 69 68 100 65 80 ...
## $ Height : int 172 163 170 170 167 167 168 170 172 172 ...
## $ BMI : num 29.8 33.1 30.8 33.9 38 ...
## $ Freq.absence : num 22 4 112 0 14 8 4 0 8 24 ...
## $ Freq.failure : int 1 2 1 0 5 0 2 1 0 0 ...
## $ First.start : int 23 36 20 27 30 20 25 25 42 25 ...
# Standardize the 18 continuous variables before the correlation screening.
Absenteeism_complete_UniqueIDConStand=scale(Absenteeism_complete_UniqueIDCon)
# Pearson correlations, keeping only |r| >= 0.5 for inspection.
corvarPearson <- round(cor(Absenteeism_complete_UniqueIDConStand),2)
corvarPearson[corvarPearson > -0.5 & corvarPearson < 0.5] <- NA
#View(corvarPearson)
# BUG FIX: the Spearman matrix was computed without method="spearman", so it
# silently duplicated the Pearson matrix.
corvarSpearm <- round(cor(Absenteeism_complete_UniqueIDConStand, method="spearman"),2)
corvarSpearm[corvarSpearm > -0.5 & corvarSpearm < 0.5] <- NA
#View(corvarSpearm)
library(corrplot)
# Side-by-side Pearson (left) and Spearman (right) correlation heat maps.
par(mfrow=c(1,2))
CorrMatrix1 <- data.matrix(Absenteeism_complete_UniqueIDConStand)
corrplot(cor(CorrMatrix1), diag = FALSE, order = "FPC", tl.pos = "td", tl.cex = 0.7, method = "color", type = "upper",number.cex = .6)
corrplot(cor(CorrMatrix1, method="spearman"), diag = FALSE, order = "FPC", tl.pos = "td", tl.cex = 0.7, method = "color", type = "upper",number.cex = .6)

Freq.absence and sum absenteeism.time.in.hours Pears 0.82
Sum.Number.of.days.absent and Freq.absence Pears 0.82
Sum.Absenteeism.time.in.hours and Sum.Number.of.days.absent Pears 1
Avg.Number.of.days.absent and Avg.Absenteeism.time.in.hours Pears 0.99
Age and First.start Pears 0.74
Age and service time Pears 0.63
Weight and BMI Pears 0.91
Because of these correlations we will keep the hours rather than the days, and First.start and Service.time rather than Age. We will also exclude Freq.absence, Sum.Absenteeism.time.in.hours and Sum.Number.of.days.absent from the inputs: the goal of the clustering is to describe Freq.absence, so the clusters must be built without its information. At the end, we will check whether the clusters show a pattern with respect to Freq.absence.
# Drop redundant / target-leaking variables: Age (correlated with First.start
# and Service.time), Weight (correlated with BMI), the day-based sum/average
# duplicates, and Freq.absence itself — the clusters must be built without the
# variable they are meant to describe.
Absenteeism_complete_UniqueIDConStand_sel=subset(Absenteeism_complete_UniqueIDConStand, select=-c(Age,Weight,Sum.Number.of.days.absent, Sum.Absenteeism.time.in.hours, Freq.absence, Avg.Number.of.days.absent))
Absenteeism_complete_UniqueIDConStand_sel=as.data.frame(Absenteeism_complete_UniqueIDConStand_sel)
str(Absenteeism_complete_UniqueIDConStand_sel) #12 variables
## 'data.frame': 36 obs. of 12 variables:
## $ Avg.Absenteeism.time.in.hours : num -0.3459 -0.2167 -0.552 -1.2933 -0.0137 ...
## $ Avg.Hour.Work.load.Average.day : num -0.3188 -2.9905 -0.3956 0.0677 -0.3665 ...
## $ Avg.Hit.target : num 0.0824 -1.9145 0.0994 0.0526 -0.9778 ...
## $ Transportation.expense : num -0.0308 -0.0308 -0.7877 -1.6123 -0.0308 ...
## $ Distance.from.Residence.to.Work: num -1.115 0.124 1.639 -0.909 -0.495 ...
## $ Service.time : num 0.3035 -0.0732 1.0569 0.1151 0.1151 ...
## $ Son : num -0.136 -0.136 -1.118 -0.136 -0.136 ...
## $ Pet : num -0.133 1.779 -0.611 3.213 -0.611 ...
## $ Height : num -0.167 -1.589 -0.483 -0.483 -0.957 ...
## $ BMI : num 0.708 1.422 0.931 1.589 2.458 ...
## $ Freq.failure : num -0.0777 0.6217 -0.0777 -0.7771 2.7198 ...
## $ First.start : num -0.479 1.636 -0.967 0.172 0.66 ...
#check that no strong correlation remains among the retained variables
corvarPearson <- round(cor(Absenteeism_complete_UniqueIDConStand_sel),2)
corvarPearson[corvarPearson > -0.5 & corvarPearson < 0.5] <- NA
#View(corvarPearson)
# BUG FIX: method="spearman" was missing, so this line repeated the Pearson
# matrix instead of computing the Spearman one.
corvarSpearm <- round(cor(Absenteeism_complete_UniqueIDConStand_sel, method="spearman"),2)
corvarSpearm[corvarSpearm > -0.5 & corvarSpearm < 0.5] <- NA
#View(corvarSpearm)
Principal Components UniqueID
# PCA on the 12 standardized variables (correlation matrix, keep the scores).
xnormID.pca <- princomp(Absenteeism_complete_UniqueIDConStand_sel,cor=TRUE, scores = TRUE) #scores=TRUE
summary(xnormID.pca) #to compare with PCA command
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.4607543 1.3504442 1.2509796 1.2049738 1.06478724
## Proportion of Variance 0.1778169 0.1519750 0.1304125 0.1209968 0.09448099
## Cumulative Proportion 0.1778169 0.3297919 0.4602044 0.5812012 0.67568219
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.98905399 0.87522234 0.80691243 0.75705294
## Proportion of Variance 0.08151898 0.06383451 0.05425897 0.04776076
## Cumulative Proportion 0.75720117 0.82103568 0.87529466 0.92305542
## Comp.10 Comp.11 Comp.12
## Standard deviation 0.69761269 0.49829689 0.43401812
## Proportion of Variance 0.04055529 0.02069165 0.01569764
## Cumulative Proportion 0.96361071 0.98430236 1.00000000
#considering the % of variance explained, screeplot
#plot(xnormID.pca, type = "l")
library("FactoMineR")
# NOTE(review): ncp = 13 but only 12 variables exist — presumably FactoMineR
# caps the number of components at 12; confirm.
resID.pca=PCA(Absenteeism_complete_UniqueIDConStand_sel, ncp = 13, graph = FALSE)
library("factoextra")
# Scree plot with the percentage of variance per component.
fviz_eig(resID.pca, addlabels = TRUE, ylim = c(0, 50))
And in particular:
# Eigenvalues and cumulative % of variance per dimension.
eigID.val <- get_eigenvalue(resID.pca)
eigID.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.1338030 17.781692 17.78169
## Dim.2 1.8236996 15.197496 32.97919
## Dim.3 1.5649500 13.041250 46.02044
## Dim.4 1.4519619 12.099682 58.12012
## Dim.5 1.1337719 9.448099 67.56822
## Dim.6 0.9782278 8.151898 75.72012
## Dim.7 0.7660141 6.383451 82.10357
## Dim.8 0.6511077 5.425897 87.52947
## Dim.9 0.5731292 4.776076 92.30554
## Dim.10 0.4866635 4.055529 96.36107
## Dim.11 0.2482998 2.069165 98.43024
## Dim.12 0.1883717 1.569764 100.00000
#code to save the scores that we will need later in the clustering analysis
matID=xnormID.pca$scores
#dim(mat)
#matcomp9ID=matID[, 1:9]
# Keep the first 10 PCs (~96% of cumulative variance, see eigenvalues above).
matcomp10ID=matID[, 1:10]
xnormID.pca$loadings #results of the princomp command. NB: the components are standardized! we can consider a loading relative to its component but not absolutely; it is normal that SS loadings equals 1
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Avg.Absenteeism.time.in.hours 0.321 0.419 -0.255 -0.216 0.188
## Avg.Hour.Work.load.Average.day -0.274 0.247 -0.449 0.217 -0.510
## Avg.Hit.target 0.478 0.171 0.321 -0.350
## Transportation.expense -0.501 -0.407 -0.208
## Distance.from.Residence.to.Work -0.311 -0.420 -0.207 -0.151 0.271
## Service.time -0.167 0.367 0.437 0.431
## Son 0.334 -0.261 -0.202 0.596 0.152
## Pet -0.201 -0.213 -0.339 0.452 0.311 -0.138
## Height 0.483 0.147 -0.274
## BMI -0.446 0.280 0.163 0.264 -0.332
## Freq.failure -0.244 0.323 -0.110 -0.307 -0.341 -0.375
## First.start 0.281 -0.381 0.343 -0.405
## Comp.7 Comp.8 Comp.9 Comp.10 Comp.11
## Avg.Absenteeism.time.in.hours 0.175 -0.406 0.314 0.398
## Avg.Hour.Work.load.Average.day 0.107 0.290 -0.442 -0.153
## Avg.Hit.target 0.374 -0.324 0.162 0.457
## Transportation.expense 0.175 -0.276 -0.609
## Distance.from.Residence.to.Work 0.147 0.277 0.667
## Service.time 0.335 -0.164 0.147 0.155
## Son -0.419 0.186 -0.428
## Pet -0.264 -0.269 0.209 -0.371 0.267
## Height -0.365 -0.530 0.356 -0.277
## BMI -0.113 -0.308 -0.288
## Freq.failure -0.414 0.267 0.246 0.159 0.318
## First.start 0.478 0.183 0.252 -0.378
## Comp.12
## Avg.Absenteeism.time.in.hours -0.331
## Avg.Hour.Work.load.Average.day -0.200
## Avg.Hit.target -0.169
## Transportation.expense 0.203
## Distance.from.Residence.to.Work -0.178
## Service.time 0.510
## Son
## Pet 0.291
## Height 0.176
## BMI -0.567
## Freq.failure 0.209
## First.start
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083
## Cumulative Var 0.083 0.167 0.250 0.333 0.417 0.500 0.583 0.667
## Comp.9 Comp.10 Comp.11 Comp.12
## SS loadings 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083
## Cumulative Var 0.750 0.833 0.917 1.000
Considering the loadings and their interpretation, we look at the contribution histograms:
# Contributions of variables to each of the 12 principal components.
# The twelve copy-pasted fviz_contrib() calls were collapsed into one loop;
# print() is required so the ggplot objects render from inside the loop.
# (The original call for axis 3 used top = 13 — a typo; with only 12
# variables, top = 12 and top = 13 draw the same plot.)
for (axis in 1:12) {
  print(fviz_contrib(resID.pca, choice = "var", axes = axis, top = 12))
}

library("corrplot")
# Heat map of variable contributions to each PC.
corrplot(resID.pca$var$contrib, is.corr=FALSE)

# Combined contribution over the 10 retained components.
fviz_contrib(resID.pca, choice = "var", axes = 1:10, top = 12)

library("corrplot")
# cos2 = quality of representation of each variable on each PC.
corrplot(resID.pca$var$cos2, is.corr=FALSE)

# Total cos2 of variables over Dim.1 to Dim.10
fviz_cos2(resID.pca, choice = "var", axes = 1:10)

# Color by cos2 values: quality on the factor map
fviz_pca_var(resID.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

Factor Analysis UniqueID
# Kaiser-Meyer-Olkin sampling-adequacy check before attempting factor analysis.
Rid=cor(Absenteeism_complete_UniqueIDConStand_sel)
library(psych)
KMO(Rid)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = Rid)
## Overall MSA = 0.37
## MSA for each item =
## Avg.Absenteeism.time.in.hours Avg.Hour.Work.load.Average.day
## 0.37 0.34
## Avg.Hit.target Transportation.expense
## 0.42 0.38
## Distance.from.Residence.to.Work Service.time
## 0.54 0.28
## Son Pet
## 0.28 0.32
## Height BMI
## 0.50 0.36
## Freq.failure First.start
## 0.34 0.33
We will not go ahead with factor analysis because the overall MSA is 0.37, well below the commonly used 0.5 adequacy threshold.
Clustering Analysis Unique ID
Since the data set is small and shows little variability, we will continue to use rank-based distances.
Number of clusters
library("NbClust")
library(FactoMineR)
library(factoextra)
# Elbow method
# (total within-cluster SS of hcut on the Spearman distances of the PC scores)
fviz_nbclust(matcomp10ID, hcut, method = "wss", diss=get_dist(matcomp10ID, method="spearman")) +
#geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")

6 is the number suggested.
# Silhouette method
# (average silhouette width of hcut on the same Spearman distances)
fviz_nbclust(matcomp10ID, hcut, method = "silhouette", diss=get_dist(matcomp10ID, method="spearman")) +
#geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "silhouette")

2 is the number suggested.
We will proceed with two and/or six.
# Kendall and Spearman
library(factoextra)
# Rank-based distance matrices on the 10 retained PC scores.
dSpearmID=get_dist(matcomp10ID, method = "spearman")
dKendID=get_dist(matcomp10ID, method = "kendall")
library(fpc)
# PAM (k-medoids) with k = 2 on the precomputed Spearman distances (diss=TRUE).
clmSpearm2ID=pamk(dSpearmID, k=2, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
# Cluster sizes of the 2-medoid solution.
table(clmSpearm2ID$pamobject$clustering)
##
## 1 2
## 16 20
# Cross-tab: 2-cluster membership vs the Freq.absence values.
table(clmSpearm2ID$pamobject$clustering, Absenteeism_complete_UniqueID$Freq.absence)
##
## 0 2 4 5 6 7 8 10 13 14 20 22 23 24 28 29 35 38 39 46 54 75 112
## 1 1 3 1 0 1 2 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 1 1
## 2 2 1 2 3 0 0 2 0 1 1 1 0 1 0 1 2 1 1 1 0 0 0 0
# Attach the 2-cluster labels, then run one-way ANOVAs variable by variable
# to see which variables differ between the two clusters.
Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2<-as.factor(clmSpearm2ID$pamobject$clustering)
#str(Absenteeism_complete_UniqueIDConStand_sel)
# [,1] = Avg.Absenteeism.time.in.hours (p = 0.0394, significant at 5%).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,1]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 4.164 4.164
## Residuals 34 30.836 0.907
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 4.591 0.0394 *
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# [,2] = Avg.Hour.Work.load.Average.day: no cluster effect (p = 0.644).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,2]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 0.22 0.2224
## Residuals 34 34.78 1.0229
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 0.217 0.644
## Residuals
# [,3] = Avg.Hit.target: not significant (p = 0.119).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,3]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 2.45 2.4545
## Residuals 34 32.55 0.9572
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 2.564 0.119
## Residuals
# [,4] = Transportation.expense: not significant (p = 0.967).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,4]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 0 0.0018
## Residuals 34 35 1.0294
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 0.002 0.967
## Residuals
# [,5] = Distance.from.Residence.to.Work: not significant (p = 0.512).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,5]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 0.45 0.4453
## Residuals 34 34.55 1.0163
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 0.438 0.512
## Residuals
# [,6] = Service.time: highly significant cluster effect (p = 0.000556).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,6]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 10.47 10.472
## Residuals 34 24.53 0.721
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 14.52 0.000556
## Residuals
##
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# [,7] = Son: highly significant cluster effect (p = 4.65e-05).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,7]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 13.66 13.659
## Residuals 34 21.34 0.628
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 21.76 4.65e-05
## Residuals
##
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# [,8] = Pet: not significant (p = 0.701).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,8]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 0.15 0.1536
## Residuals 34 34.85 1.0249
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 0.15 0.701
## Residuals
# [,9] = Height: not significant (p = 0.124).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,9]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 2.38 2.3803
## Residuals 34 32.62 0.9594
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 2.481 0.124
## Residuals
# [,10] = BMI: borderline only (p = 0.074).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,10]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 3.18 3.181
## Residuals 34 31.82 0.936
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 3.399 0.074 .
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# [,11] = Freq.failure: borderline only (p = 0.0673).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,11]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 3.33 3.329
## Residuals 34 31.67 0.932
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 3.574 0.0673 .
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# [,12] = First.start: not significant (p = 0.664).
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,12]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 0.2 0.196
## Residuals 34 34.8 1.024
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 0.191 0.664
## Residuals
#str(Absenteeism_complete_UniqueIDCon)
summary(aov(Absenteeism_complete_UniqueIDCon[,16]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1 679 678.6
## Residuals 34 18773 552.2
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed2 1.229 0.275
## Residuals
Service.time and Son are the only variables for which cluster membership is significant.
Freq.absence has a high p-value (0.275), so it does not differ between the clusters.
# Visualize the 2-cluster K-medoids solution on the selected standardized variables.
# FIX: "ggthem" was a misspelling of the ggtheme argument (silently swallowed via
# ...), and palette should be NULL rather than the string "NULL".
# NOTE(review): the title mentions "the PC" although the plotted data are the
# selected standardized variables -- confirm which is intended.
fviz_cluster(object=list(data=Absenteeism_complete_UniqueIDConStand_sel[,1:12], cluster=clmSpearm2ID$pamobject$clustering), repel=TRUE, show.clust.cent=TRUE , palette=NULL, ggtheme=theme_minimal(), main="K-medoids with Spearman Distance of the PC", geom=c("point"), ellipse=TRUE)

# K-medoids (PAM) with k = 6 on the Spearman distance matrix of the unique-ID data.
clmSpearm6ID=pamk(dSpearmID, k=6, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
table(clmSpearm6ID$pamobject$clustering)
##
## 1 2 3 4 5 6
## 6 5 6 4 7 8
table(clmSpearm6ID$pamobject$clustering, Absenteeism_complete_UniqueID$Freq.absence)
##
## 0 2 4 5 6 7 8 10 13 14 20 22 23 24 28 29 35 38 39 46 54 75 112
## 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0
## 3 2 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
## 5 0 0 1 2 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0
## 6 0 2 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0
# FIX: the cluster labels were taken from clmSpearm2ID (the 2-cluster solution),
# so every "clusterKMed6" ANOVA below reproduced the 2-cluster results exactly.
# Use the 6-cluster assignment computed above instead.
Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6<-as.factor(clmSpearm6ID$pamobject$clustering)
#str(Absenteeism_complete_UniqueIDConStand_sel)
# One-way ANOVAs of each standardized variable (columns 1-12) against the
# 6-cluster K-medoids assignment.
# NOTE(review): in the original knit these tables are identical to the 2-cluster
# ANOVAs above, because clusterKMed6 had been assigned from clmSpearm2ID; after
# correcting that assignment, re-knit to refresh the printed output below.
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,1]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 4.164 4.164
## Residuals 34 30.836 0.907
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 4.591 0.0394 *
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,2]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 0.22 0.2224
## Residuals 34 34.78 1.0229
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 0.217 0.644
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,3]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 2.45 2.4545
## Residuals 34 32.55 0.9572
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 2.564 0.119
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,4]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 0 0.0018
## Residuals 34 35 1.0294
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 0.002 0.967
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,5]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 0.45 0.4453
## Residuals 34 34.55 1.0163
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 0.438 0.512
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,6]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 10.47 10.472
## Residuals 34 24.53 0.721
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 14.52 0.000556
## Residuals
##
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,7]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 13.66 13.659
## Residuals 34 21.34 0.628
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 21.76 4.65e-05
## Residuals
##
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,8]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 0.15 0.1536
## Residuals 34 34.85 1.0249
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 0.15 0.701
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,9]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 2.38 2.3803
## Residuals 34 32.62 0.9594
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 2.481 0.124
## Residuals
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,10]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 3.18 3.181
## Residuals 34 31.82 0.936
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 3.399 0.074 .
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,11]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 3.33 3.329
## Residuals 34 31.67 0.932
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 3.574 0.0673 .
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(Absenteeism_complete_UniqueIDConStand_sel[,12]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 0.2 0.196
## Residuals 34 34.8 1.024
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 0.191 0.664
## Residuals
#str(Absenteeism_complete_UniqueIDCon)
# NOTE(review): as above, the response here is column 16 of a different data
# frame (Absenteeism_complete_UniqueIDCon) -- confirm the rows align.
summary(aov(Absenteeism_complete_UniqueIDCon[,16]~Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6,data=Absenteeism_complete_UniqueIDConStand_sel))
## Df Sum Sq Mean Sq
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1 679 678.6
## Residuals 34 18773 552.2
## F value Pr(>F)
## Absenteeism_complete_UniqueIDConStand_sel$clusterKMed6 1.229 0.275
## Residuals
Same results as with 2 clusters.
The clusters with high frequencies also contain some observations with low frequencies. What does this mean? It is impossible to detect a pattern even with 6 clusters (which is a lot, since there are only 36 rows)!
MCA for freq.absence for UniqueID
# Inspect the per-worker (unique ID) data frame: 36 workers x 21 variables.
str(Absenteeism_complete_UniqueID)
## 'data.frame': 36 obs. of 21 variables:
## $ ID : Factor w/ 36 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Sum.Absenteeism.time.in.hours : int 121 25 482 0 104 72 30 0 262 186 ...
## $ Avg.Absenteeism.time.in.hours : num 5.5 6.25 4.3 0 7.43 ...
## $ Avg.Hour.Work.load.Average.day : num 4.4 3.53 4.37 4.52 4.38 ...
## $ Sum.Number.of.days.absent : num 27.74 7.25 109.1 0 24.13 ...
## $ Avg.Number.of.days.absent : num 1.261 1.813 0.974 0 1.724 ...
## $ Avg.Hit.target : num 95 92 95.1 95 93.4 ...
## $ Transportation.expense : int 235 235 179 118 235 189 279 231 228 361 ...
## $ Distance.from.Residence.to.Work: int 11 29 51 14 20 29 5 35 14 52 ...
## $ Service.time : int 14 12 18 13 13 13 14 14 16 3 ...
## $ Age : int 37 48 38 40 43 33 39 39 58 28 ...
## $ Education : Ord.factor w/ 3 levels "High School"<..: 3 1 1 1 1 1 1 1 1 1 ...
## $ Son : int 1 1 0 1 1 2 2 2 2 1 ...
## $ Bad.habits : chr "None" "Smoker" "Drinker" "Drinker" ...
## $ Pet : int 1 5 0 8 0 2 0 2 1 4 ...
## $ Weight : int 88 88 89 98 106 69 68 100 65 80 ...
## $ Height : int 172 163 170 170 167 167 168 170 172 172 ...
## $ BMI : num 29.8 33.1 30.8 33.9 38 ...
## $ Freq.absence : num 22 4 112 0 14 8 4 0 8 24 ...
## $ Freq.failure : int 1 2 1 0 5 0 2 1 0 0 ...
## $ First.start : int 23 36 20 27 30 20 25 25 42 25 ...
# Distribution of Freq.absence per worker: histogram (sqrt-rule bin count) and boxplot.
par(mfrow=c(2,1))
hist( Absenteeism_complete_UniqueID$Freq.absence, breaks = sqrt( length( Absenteeism_complete_UniqueID$Freq.absence) ), probability = TRUE,
col = 'lavender', main = 'Freq.absence', xlab = 'Freq.absence' )
boxplot(Absenteeism_complete_UniqueID$Freq.absence)

#discretize freq absence into three ordered bands
# FIX: the original spec used overlapping ranges ("0:20"/"20:40"/"40:120").
# car::Recode applies terms in order with later terms overwriting earlier ones,
# so 20 ended up 'Mid' and 40 would end up 'High'. The non-overlapping spec
# below makes the binning explicit without changing any assignment.
Absenteeism_complete_UniqueID$Freq.absenceBin<-Recode(Absenteeism_complete_UniqueID$Freq.absence,"0:19='VeryLow';20:39='Mid';40:120='High'")
table (Absenteeism_complete_UniqueID$Freq.absenceBin)
##
## High Mid VeryLow
## 4 9 23
# Pairwise chi-squared tests of independence between the categorical variables
# (Education, Bad.habits) and the binned absence frequency.
# NOTE(review): with only 36 observations many expected cell counts are likely
# < 5, so the chi-squared approximation is doubtful -- consider fisher.test()
# or chisq.test(..., simulate.p.value = TRUE).
UniqIDcat=subset(Absenteeism_complete_UniqueID, select=c(Education, Bad.habits, Freq.absenceBin))
tab1=table(UniqIDcat$Education, UniqIDcat$Bad.habits)
chisq.test(tab1)
##
## Pearson's Chi-squared test
##
## data: tab1
## X-squared = 5.3388, df = 6, p-value = 0.5012
tab2=table(UniqIDcat$Education,UniqIDcat$Freq.absenceBin )
chisq.test(tab2)
##
## Pearson's Chi-squared test
##
## data: tab2
## X-squared = 3.2671, df = 4, p-value = 0.5142
tab3=table(UniqIDcat$Bad.habits, UniqIDcat$Freq.absenceBin)
chisq.test(tab3)
##
## Pearson's Chi-squared test
##
## data: tab3
## X-squared = 9.7207, df = 6, p-value = 0.1369
There seem to be no relations, so we will not investigate further.
Annex
Clustering analysis with original variables
Without PCA! Original and standardized variables.
#str(AbsenteeismCont_Norm_presel)
library("NbClust")
library(FactoMineR)
library(factoextra)
# Elbow method
# Total within-cluster sum of squares vs number of hierarchical (hcut) clusters,
# using the Spearman correlation distance on the original standardized variables.
fviz_nbclust(AbsenteeismCont_Norm_presel, hcut, method = "wss", diss=get_dist(AbsenteeismCont_Norm_presel, method="spearman")) +
#geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")


The elbow appears at 3, 6, or 7 clusters.
# Silhouette method
# Average silhouette width vs number of hierarchical (hcut) clusters, same distance.
fviz_nbclust(AbsenteeismCont_Norm_presel, hcut, method = "silhouette", diss=get_dist(AbsenteeismCont_Norm_presel, method="spearman") )+
labs(subtitle = "Silhouette method")

Let’s try 3 and 7 clusters.
Clustering algorithms
Hierarchical
The distance will be applied to the original, standardized variables. We chose 3 clusters with the Spearman distance and 7 clusters with the Kendall distance.
NB: run this file after the clustering on the PCA, otherwise the results will be messy.
#d<-dist(matcomp9)
# Kendall and Spearman
# Correlation-based dissimilarity matrices between observations, computed on the
# original standardized variables, plus heatmap visualizations.
library(factoextra)
dSpearmOr=get_dist(AbsenteeismCont_Norm_presel, method = "spearman")
dKendOr=get_dist(AbsenteeismCont_Norm_presel, method = "kendall")
fviz_dist(dSpearmOr)

fviz_dist(dKendOr)

# Hierarchical clustering, Ward linkage, Spearman distance; cut into 3 clusters
# and cross-tabulate against the absenteeism hour groupings.
fit_ward<-hclust(dSpearmOr,method="ward.D")
plot(fit_ward)
rect.hclust(fit_ward, k=3,border="red")

groups_wardSpearmOr <- cutree(fit_ward, k=3)
AbsenteeismCont$groups_wardSpearmOr<-groups_wardSpearmOr #create the column with the clusters of ward
table(groups_wardSpearmOr)
## groups_wardSpearmOr
## 1 2 3
## 266 107 294
table(AbsenteeismCont$Hoursgroup,AbsenteeismCont$groups_wardSpearmOr)
##
## 1 2 3
## 1hours 32 18 36
## 2hours 58 26 73
## lotofhours 124 21 104
## midtimeinhours 52 42 81
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_wardSpearmOr)
##
## 1 2 3
## 1 32 18 36
## 2 58 26 73
## 3 30 32 47
## 4 20 10 28
## 5 2 0 5
## 7 0 0 1
## 8 103 16 85
## 16 8 1 9
## 24 7 3 5
## 32 3 1 1
## 40 2 0 4
## 48 1 0 0
# Hierarchical clustering, Ward linkage, Kendall distance; cut into 7 clusters.
fit_ward<-hclust(dKendOr,method="ward.D")
plot(fit_ward)
rect.hclust(fit_ward, k=7,border="red")

groups_wardKendOr <- cutree(fit_ward, k=7)
AbsenteeismCont$groups_wardKendOr<-groups_wardKendOr #create the column with the clusters of ward
table(groups_wardKendOr)
## groups_wardKendOr
## 1 2 3 4 5 6 7
## 127 107 63 174 45 74 77
table(AbsenteeismCont$Hoursgroup,AbsenteeismCont$groups_wardKendOr)
##
## 1 2 3 4 5 6 7
## 1hours 12 18 7 16 7 13 13
## 2hours 16 26 18 27 21 25 24
## lotofhours 77 21 30 84 9 10 18
## midtimeinhours 22 42 8 47 8 26 22
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_wardKendOr)
##
## 1 2 3 4 5 6 7
## 1 12 18 7 16 7 13 13
## 2 16 26 18 27 21 25 24
## 3 10 32 4 20 6 21 16
## 4 12 10 3 24 0 4 5
## 5 0 0 1 2 2 1 1
## 7 0 0 0 1 0 0 0
## 8 67 16 22 70 5 9 15
## 16 6 1 2 5 3 1 0
## 24 3 3 2 4 1 0 2
## 32 1 1 1 1 0 0 1
## 40 0 0 2 4 0 0 0
## 48 0 0 1 0 0 0 0
# single linkage
# Hierarchical clustering, single linkage, Spearman distance; cut into 3 clusters.
fit_single<-hclust(dSpearmOr, method="single")
plot(fit_single)
rect.hclust(fit_single, k=3, border="red")

groups_singleSpearmOr <- cutree(fit_single, k=3)
AbsenteeismCont$groups_singleSpearmOr<-groups_singleSpearmOr
table(groups_singleSpearmOr)
## groups_singleSpearmOr
## 1 2 3
## 486 107 74
table(AbsenteeismCont$Hoursgroup,groups_singleSpearmOr)
## groups_singleSpearmOr
## 1 2 3
## 1hours 55 18 13
## 2hours 106 26 25
## lotofhours 218 21 10
## midtimeinhours 107 42 26
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_singleSpearmOr)
##
## 1 2 3
## 1 55 18 13
## 2 106 26 25
## 3 56 32 21
## 4 44 10 4
## 5 6 0 1
## 7 1 0 0
## 8 179 16 9
## 16 16 1 1
## 24 12 3 0
## 32 4 1 0
## 40 6 0 0
## 48 1 0 0
# single linkage
# Hierarchical clustering, single linkage, Kendall distance; cut into 7 clusters.
fit_single<-hclust(dKendOr, method="single")
plot(fit_single)
rect.hclust(fit_single, k=7, border="red")

groups_singleKendOr <- cutree(fit_single, k=7)
AbsenteeismCont$groups_singleKendOr<-groups_singleKendOr
table(groups_singleKendOr)
## groups_singleKendOr
## 1 2 3 4 5 6 7
## 218 107 109 37 45 74 77
table(AbsenteeismCont$Hoursgroup,groups_singleKendOr)
## groups_singleKendOr
## 1 2 3 4 5 6 7
## 1hours 21 18 14 0 7 13 13
## 2hours 37 26 23 1 21 25 24
## lotofhours 125 21 51 15 9 10 18
## midtimeinhours 35 42 21 21 8 26 22
table(Absenteeism_Clustering$Absenteeism.time.in.hours,AbsenteeismCont$groups_singleKendOr)
##
## 1 2 3 4 5 6 7
## 1 21 18 14 0 7 13 13
## 2 37 26 23 1 21 25 24
## 3 18 32 8 8 6 21 16
## 4 16 10 10 13 0 4 5
## 5 1 0 2 0 2 1 1
## 7 0 0 1 0 0 0 0
## 8 105 16 42 12 5 9 15
## 16 9 1 2 2 3 1 0
## 24 6 3 3 0 1 0 2
## 32 2 1 1 0 0 0 1
## 40 2 0 3 1 0 0 0
## 48 1 0 0 0 0 0 0
# complete linkage
# Hierarchical clustering, complete linkage, Spearman distance; cut into 3 clusters.
fit_complete<-hclust(dSpearmOr, method="complete")
plot(fit_complete)
rect.hclust(fit_complete, k=3, border="red")

groups_completeSpearmOr <- cutree(fit_complete, k=3)
AbsenteeismCont$groups_completeSpearmOr<-groups_completeSpearmOr
table(groups_completeSpearmOr)
## groups_completeSpearmOr
## 1 2 3
## 293 226 148
table(AbsenteeismCont$Hoursgroup,groups_completeSpearmOr)
## groups_completeSpearmOr
## 1 2 3
## 1hours 28 38 20
## 2hours 43 72 42
## lotofhours 153 40 56
## midtimeinhours 69 76 30
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_completeSpearmOr)
## groups_completeSpearmOr
## 1 2 3
## 1 28 38 20
## 2 43 72 42
## 3 30 59 20
## 4 36 14 8
## 5 2 3 2
## 7 1 0 0
## 8 130 30 44
## 16 10 5 3
## 24 7 4 4
## 32 2 1 2
## 40 4 0 2
## 48 0 0 1
# complete linkage
# Hierarchical clustering, complete linkage, Kendall distance; cut into 7 clusters.
fit_complete<-hclust(dKendOr, method="complete")
plot(fit_complete)
rect.hclust(fit_complete, k=7, border="red")

groups_completeKendOr <- cutree(fit_complete, k=7)
AbsenteeismCont$groups_completeKendOr<-groups_completeKendOr
table(groups_completeKendOr)
## groups_completeKendOr
## 1 2 3 4 5 6 7
## 98 129 152 77 60 74 77
table(AbsenteeismCont$Hoursgroup,groups_completeKendOr)
## groups_completeKendOr
## 1 2 3 4 5 6 7
## 1hours 11 8 25 6 10 13 13
## 2hours 22 13 47 11 15 25 24
## lotofhours 49 73 30 46 23 10 18
## midtimeinhours 16 35 50 14 12 26 22
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_completeKendOr)
## groups_completeKendOr
## 1 2 3 4 5 6 7
## 1 11 8 25 6 10 13 13
## 2 22 13 47 11 15 25 24
## 3 7 15 38 9 3 21 16
## 4 8 20 10 4 7 4 5
## 5 1 0 2 0 2 1 1
## 7 0 0 0 1 0 0 0
## 8 39 62 21 42 16 9 15
## 16 4 6 4 2 1 1 0
## 24 2 3 4 1 3 0 2
## 32 1 1 1 0 1 0 1
## 40 2 1 0 1 2 0 0
## 48 1 0 0 0 0 0 0
# average linkage
# Hierarchical clustering, average linkage (UPGMA), Spearman distance; 3 clusters.
fit_average<-hclust(dSpearmOr, method="average")
plot(fit_average)
rect.hclust(fit_average, k=3, border="red")

groups_averageSpearmOr <- cutree(fit_average, k=3)
table(groups_averageSpearmOr)
## groups_averageSpearmOr
## 1 2 3
## 301 226 140
AbsenteeismCont$groups_averageSpearmOr<-groups_averageSpearmOr
table(AbsenteeismCont$Hoursgroup,groups_averageSpearmOr)
## groups_averageSpearmOr
## 1 2 3
## 1hours 28 38 20
## 2hours 43 72 42
## lotofhours 161 40 48
## midtimeinhours 69 76 30
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_averageSpearmOr)
## groups_averageSpearmOr
## 1 2 3
## 1 28 38 20
## 2 43 72 42
## 3 30 59 20
## 4 36 14 8
## 5 2 3 2
## 7 1 0 0
## 8 137 30 37
## 16 11 5 2
## 24 7 4 4
## 32 2 1 2
## 40 4 0 2
## 48 0 0 1
# average linkage
# Hierarchical clustering, average linkage (UPGMA), Kendall distance; 7 clusters.
fit_average<-hclust(dKendOr, method="average")
plot(fit_average)
rect.hclust(fit_average, k=7, border="red")

groups_averageKendOr <- cutree(fit_average, k=7)
table(groups_averageKendOr)
## groups_averageKendOr
## 1 2 3 4 5 6 7
## 127 152 30 77 110 97 74
AbsenteeismCont$groups_averageKendOr<-groups_averageKendOr
table(AbsenteeismCont$Hoursgroup,groups_averageKendOr)
## groups_averageKendOr
## 1 2 3 4 5 6 7
## 1hours 12 25 4 6 16 10 13
## 2hours 16 47 9 11 33 16 25
## lotofhours 77 30 14 46 34 38 10
## midtimeinhours 22 50 3 14 27 33 26
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_averageKendOr)
## groups_averageKendOr
## 1 2 3 4 5 6 7
## 1 12 25 4 6 16 10 13
## 2 16 47 9 11 33 16 25
## 3 10 38 3 9 17 11 21
## 4 12 10 0 4 8 20 4
## 5 0 2 0 0 2 2 1
## 7 0 0 0 1 0 0 0
## 8 67 21 13 42 24 28 9
## 16 6 4 0 2 2 3 1
## 24 3 4 0 1 4 3 0
## 32 1 1 0 0 2 1 0
## 40 0 0 1 1 1 3 0
## 48 0 0 0 0 1 0 0
# centroid method
# NOTE(review): hclust's centroid method is formally defined for squared
# Euclidean distances; with a correlation-based dissimilarity the dendrogram
# may contain inversions, so interpret with care.
fit_centroid<-hclust(dSpearmOr, method="centroid")
plot(fit_centroid)
rect.hclust(fit_centroid, k=3, border="red")

groups_centroidSpearmOr <- cutree(fit_centroid, k=3)
table(groups_centroidSpearmOr)
## groups_centroidSpearmOr
## 1 2 3
## 486 107 74
AbsenteeismCont$groups_centroidSpearmOr<-groups_centroidSpearmOr
table(AbsenteeismCont$Hoursgroup,groups_centroidSpearmOr)
## groups_centroidSpearmOr
## 1 2 3
## 1hours 55 18 13
## 2hours 106 26 25
## lotofhours 218 21 10
## midtimeinhours 107 42 26
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_centroidSpearmOr)
## groups_centroidSpearmOr
## 1 2 3
## 1 55 18 13
## 2 106 26 25
## 3 56 32 21
## 4 44 10 4
## 5 6 0 1
## 7 1 0 0
## 8 179 16 9
## 16 16 1 1
## 24 12 3 0
## 32 4 1 0
## 40 6 0 0
## 48 1 0 0
# centroid method
# Same centroid-linkage caveat as above (defined for squared Euclidean distances).
fit_centroid<-hclust(dKendOr, method="centroid")
plot(fit_centroid)
rect.hclust(fit_centroid, k=7, border="red")

groups_centroidKendOr <- cutree(fit_centroid, k=7)
table(groups_centroidKendOr)
## groups_centroidKendOr
## 1 2 3 4 5 6 7
## 331 107 33 45 74 52 25
AbsenteeismCont$groups_centroidKendOr<-groups_centroidKendOr
table(AbsenteeismCont$Hoursgroup,groups_centroidKendOr)
## groups_centroidKendOr
## 1 2 3 4 5 6 7
## 1hours 32 18 3 7 13 4 9
## 2hours 52 26 9 21 25 19 5
## lotofhours 175 21 16 9 10 13 5
## midtimeinhours 72 42 5 8 26 16 6
table(Absenteeism_Clustering$Absenteeism.time.in.hours,groups_centroidKendOr)
## groups_centroidKendOr
## 1 2 3 4 5 6 7
## 1 32 18 3 7 13 4 9
## 2 52 26 9 21 25 19 5
## 3 33 32 1 6 21 11 5
## 4 36 10 3 0 4 4 1
## 5 2 0 1 2 1 1 0
## 7 1 0 0 0 0 0 0
## 8 150 16 9 5 9 12 3
## 16 11 1 2 3 1 0 0
## 24 7 3 2 1 0 0 2
## 32 2 1 1 0 0 1 0
## 40 5 0 1 0 0 0 0
## 48 0 0 1 0 0 0 0
Partitioning algorithms
# K-means with k = 3 on the original standardized variables.
# FIX: kmeans() uses one algorithm; passing the whole vector of choices just made
# match.arg() silently pick the first, so name it explicitly (behavior unchanged).
# NOTE(review): call set.seed() before this line for reproducible restarts.
clkOr=kmeans(AbsenteeismCont_Norm_presel, 3, iter.max = 100, nstart =2365 ,
algorithm = "Hartigan-Wong", trace=FALSE)
#clkOr
Note that between_SS / total_SS = 29.6%, so the three clusters explain only a small share of the total variance.
# Store the k-means assignment and cross-tabulate it against the hour groupings
# and the absence reasons.
AbsenteeismCont$clusterKMOr<-as.factor(clkOr$cluster)
table(AbsenteeismCont$clusterKMOr,AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 19 28 22 43
## 2 35 77 104 58
## 3 32 52 123 74
table(AbsenteeismCont$clusterKMOr,AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 19 28 33 10 0 0 17 1 3 1 0 0
## 2 35 77 36 18 4 0 80 10 9 2 2 1
## 3 32 52 40 30 3 1 107 7 3 2 4 0
table(Absenteeism_Clustering$Reason.for.absence.short,AbsenteeismCont$clusterKMOr )
##
## 1 2 3
## Accompanying person 0 6 32
## Dental consultation 27 33 47
## Diagnosis, donation and vaccination 4 24 12
## Diseases 22 93 60
## Injury, poisoning 0 13 21
## Medical consultation 18 54 75
## Physiotheraphy 38 26 4
## Pregnancy, childbirth, perinatal complications 0 4 2
## Symptons and abnormal exams 2 11 7
## Unjustified 1 10 21
- K-MEDOIDS: 3 clusters with Spearman distance
library(fpc)
# K-medoids (PAM) with k = 3 on the Spearman distance of the original variables.
clmSpearmOr=pamk(dSpearmOr, k=3, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
table(clmSpearmOr$pamobject$clustering)
##
## 1 2 3
## 308 209 150
AbsenteeismCont$clusterKMedOr<-as.factor(clmSpearmOr$pamobject$clustering)
table(clmSpearmOr$pamobject$clustering)
##
## 1 2 3
## 308 209 150
table(clmSpearmOr$pamobject$clustering, AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 36 52 141 79
## 2 27 59 75 48
## 3 23 46 33 48
table(clmSpearmOr$pamobject$clustering, AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 36 52 41 34 3 1 122 9 4 2 4 0
## 2 27 59 33 13 2 0 57 6 7 2 2 1
## 3 23 46 35 11 2 0 25 3 4 1 0 0
#table(clmSpearmOr$pamobject$clustering, AbsenteeismCont$Freq.absence)
Considering the reason:
# Cross-tabulate the 3 K-medoids clusters against the absence reason categories.
table( Absenteeism_Clustering$Reason.for.absence.short,clmSpearmOr$pamobject$clustering)
##
## 1 2 3
## Accompanying person 33 5 0
## Dental consultation 49 23 35
## Diagnosis, donation and vaccination 18 18 4
## Diseases 73 70 32
## Injury, poisoning 23 9 2
## Medical consultation 78 43 26
## Physiotheraphy 4 17 47
## Pregnancy, childbirth, perinatal complications 4 2 0
## Symptons and abnormal exams 9 9 2
## Unjustified 17 13 2
#clmSpearmOr$pamobject$clustering=as.factor(clmSpearmOr$pamobject$clustering)
#str(AbsenteeismCont_Norm_presel)
# Visualize the 3-cluster K-medoids solution.
# FIX: "ggthem" was a misspelling of the ggtheme argument (silently swallowed via
# ...), palette should be NULL rather than the string "NULL", and the title said
# "of the PC" although this plot uses the original standardized variables.
fviz_cluster(object=list(data=AbsenteeismCont_Norm_presel[,1:12], cluster=clmSpearmOr$pamobject$clustering), repel=TRUE, show.clust.cent=TRUE , palette=NULL, ggtheme=theme_minimal(), main="K-medoids with Spearman Distance of the original variables", geom=c("point"), ellipse=TRUE)

- K-MEDOIDS with 7 clusters and Kendall’s distance
# K-medoids (PAM) with k = 7 on the Kendall distance of the original variables.
clmKenOr=pamk(dKendOr, k=7, criterion="asw", usepam=TRUE,
scaling=FALSE, alpha=0.001, diss=TRUE,
critout=FALSE, ns=10, seed=NULL)
table(clmKenOr$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## 116 124 108 131 47 76 65
# NOTE(review): this overwrites the clusterKMedOr column created above for the
# 3-cluster Spearman solution; if both solutions are needed later, store this
# one under a distinct name (e.g. clusterKMedKenOr).
AbsenteeismCont$clusterKMedOr<-as.factor(clmKenOr$pamobject$clustering)
table(clmKenOr$pamobject$clustering)
##
## 1 2 3 4 5 6 7
## 116 124 108 131 47 76 65
table(clmKenOr$pamobject$clustering, AbsenteeismCont$Hoursgroup)
##
## 1hours 2hours lotofhours midtimeinhours
## 1 11 12 74 19
## 2 18 38 37 31
## 3 18 26 22 42
## 4 17 25 67 22
## 5 1 3 20 23
## 6 13 25 12 26
## 7 8 28 17 12
table(clmKenOr$pamobject$clustering, AbsenteeismCont$Absenteeism.time.in.hours)
##
## 1 2 3 4 5 7 8 16 24 32 40 48
## 1 11 12 7 11 0 1 65 4 3 1 1 0
## 2 18 38 20 9 2 0 27 2 4 2 1 1
## 3 18 26 32 10 0 0 17 1 3 1 0 0
## 4 17 25 10 10 2 0 56 4 4 1 2 0
## 5 1 3 9 14 0 0 17 2 0 0 1 0
## 6 13 25 21 4 1 0 11 1 0 0 0 0
## 7 8 28 10 0 2 0 11 4 1 0 1 0
table(clmKenOr$pamobject$clustering, AbsenteeismCont$Freq.absence)
##
## 2 4 5 6 7 8 10 13 14 20 22 23 24 28 29 35 38
## 1 1 5 5 0 0 4 3 12 11 0 16 10 13 0 0 0 36
## 2 2 4 0 0 0 0 0 0 0 0 6 8 0 25 27 0 0
## 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 5 3 7 1 11 8 7 0 16 0 0 5 10 0 27 31 0
## 5 0 0 0 1 0 2 0 0 0 7 0 0 0 0 0 0 0
## 6 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 7 0 0 0 4 3 0 0 0 0 13 0 0 0 0 0 0 0
##
## 39 46 54 75 112
## 1 0 0 0 0 0
## 2 0 0 52 0 0
## 3 0 0 0 0 107
## 4 0 0 0 0 0
## 5 37 0 0 0 0
## 6 0 0 0 74 0
## 7 0 45 0 0 0